Tools

lazyllm.tools.Document

Bases: ModuleBase

Initialize a document module with an optional user interface.

This constructor initializes a document module that can optionally expose a user interface. When the interface is enabled, it provides a management service for document operations and a web page for interactive use.

Parameters:

  • dataset_path (str) –

    The path to the dataset directory. This directory should contain the documents to be managed by the document module.

  • embed (Optional[Union[Callable, Dict[str, Callable]]], default: None ) –

    The object used to generate document embeddings. To generate multiple embeddings for the same text, pass a dictionary that maps each embedding name to its embedding model.

  • manager (bool, default: False ) –

    A flag indicating whether to create a user interface for the document module. Defaults to False.

  • launcher (optional, default: None ) –

    An object or function responsible for launching the server module. If not provided, the default asynchronous launcher from lazyllm.launchers is used (sync=False).

  • store_conf (optional, default: None ) –

    Configure which storage backend and index backend to use.

  • doc_fields (optional, default: None ) –

    Configure the fields that need to be stored and retrieved along with their corresponding types (currently only used by the Milvus backend).

Examples:

>>> import lazyllm
>>> from lazyllm.tools import Document
>>> m = lazyllm.OnlineEmbeddingModule(source="glm")
>>> documents = Document(dataset_path='your_doc_path', embed=m, manager=False)  # or documents = Document(dataset_path='your_doc_path', embed={"key": m}, manager=False)
>>> m1 = lazyllm.TrainableModule("bge-large-zh-v1.5").start()
>>> document1 = Document(dataset_path='your_doc_path', embed={"online": m, "local": m1}, manager=False)
>>> store_conf = {
>>>     'type': 'chroma',
>>>     'indices': {
>>>         'smart_embedding_index': {
>>>             'backend': 'milvus',
>>>             'kwargs': {
>>>                 'uri': '/tmp/tmp.db',
>>>                 'index_kwargs': {
>>>                     'index_type': 'HNSW',
>>>                     'metric_type': 'COSINE'
>>>                  }
>>>             },
>>>         },
>>>     },
>>> }
>>> doc_fields = {
>>>     'author': DocField(data_type=DataType.VARCHAR, max_size=128, default_value=' '),
>>>     'public_year': DocField(data_type=DataType.INT32),
>>> }
>>> document2 = Document(dataset_path='your_doc_path', embed={"online": m, "local": m1}, store_conf=store_conf, doc_fields=doc_fields)
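
Beyond the basic construction above, the manager can be exposed as a service (manager=True) or with a web UI (manager='ui'), and an existing manager can be shared by several knowledge-base groups by passing it to another Document. A minimal sketch, assuming that the group name 'kb_extra' is illustrative and that calling start() launches the manager service so the URLs below are populated:

>>> doc_main = Document(dataset_path='your_doc_path', embed=m, manager='ui')
>>> doc_extra = Document(dataset_path='your_doc_path', embed=m, manager=doc_main.manager, name='kb_extra')
>>> doc_main.start()                  # assumed to serve the manager; URLs are populated after startup
>>> print(doc_main.manager.url)       # REST url of the document manager service
>>> print(doc_main.manager.web_url)   # url of the web page (only set when manager='ui')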
Source code in lazyllm/tools/rag/document.py
class Document(ModuleBase):
    """Initialize a document module with an optional user interface.

This constructor initializes a document module that can have an optional user interface. If the user interface is enabled, it also provides a UI to manage the document operation interface and offers a web page for user interface interaction.

Args:
    dataset_path (str): The path to the dataset directory. This directory should contain the documents to be managed by the document module.
    embed (Optional[Union[Callable, Dict[str, Callable]]]): The object used to generate document embeddings. If you need to generate multiple embeddings for the text, you need to specify multiple embedding models in a dictionary format. The key identifies the name corresponding to the embedding, and the value is the corresponding embedding model.
    manager (bool, optional): A flag indicating whether to create a user interface for the document module. Defaults to False.
    launcher (optional): An object or function responsible for launching the server module. If not provided, the default asynchronous launcher from `lazyllm.launchers` is used (`sync=False`).
    store_conf (optional): Configure which storage backend and index backend to use.
    doc_fields (optional): Configure the fields that need to be stored and retrieved along with their corresponding types (currently only used by the Milvus backend).


Examples:
    >>> import lazyllm
    >>> from lazyllm.tools import Document
    >>> m = lazyllm.OnlineEmbeddingModule(source="glm")
    >>> documents = Document(dataset_path='your_doc_path', embed=m, manager=False)  # or documents = Document(dataset_path='your_doc_path', embed={"key": m}, manager=False)
    >>> m1 = lazyllm.TrainableModule("bge-large-zh-v1.5").start()
    >>> document1 = Document(dataset_path='your_doc_path', embed={"online": m, "local": m1}, manager=False)

    >>> store_conf = {
    >>>     'type': 'chroma',
    >>>     'indices': {
    >>>         'smart_embedding_index': {
    >>>             'backend': 'milvus',
    >>>             'kwargs': {
    >>>                 'uri': '/tmp/tmp.db',
    >>>                 'index_kwargs': {
    >>>                     'index_type': 'HNSW',
    >>>                     'metric_type': 'COSINE'
    >>>                  }
    >>>             },
    >>>         },
    >>>     },
    >>> }
    >>> doc_fields = {
    >>>     'author': DocField(data_type=DataType.VARCHAR, max_size=128, default_value=' '),
    >>>     'public_year': DocField(data_type=DataType.INT32),
    >>> }
    >>> document2 = Document(dataset_path='your_doc_path', embed={"online": m, "local": m1}, store_conf=store_conf, doc_fields=doc_fields)
    """
    class _Manager(ModuleBase):
        def __init__(self, dataset_path: str, embed: Optional[Union[Callable, Dict[str, Callable]]] = None,
                     manager: Union[bool, str] = False, server: bool = False, name: Optional[str] = None,
                     launcher: Optional[Launcher] = None, store_conf: Optional[Dict] = None,
                     doc_fields: Optional[Dict[str, DocField]] = None):
            super().__init__()
            self._origin_path = dataset_path
            if not os.path.exists(dataset_path):
                default_path = os.path.join(lazyllm.config["data_path"], dataset_path)
                if os.path.exists(default_path):
                    dataset_path = default_path
            else:
                dataset_path = os.path.join(os.getcwd(), dataset_path)
            self._launcher: Launcher = launcher if launcher else lazyllm.launchers.remote(sync=False)
            self._dataset_path = dataset_path
            self._embed = self._get_embeds(embed)
            self.name = name
            self._dlm = DocListManager(dataset_path, name, enable_path_monitoring=False if manager else True)
            self._kbs = CallableDict({DocListManager.DEFAULT_GROUP_NAME:
                                      DocImpl(embed=self._embed, dlm=self._dlm,
                                              global_metadata_desc=doc_fields,
                                              store_conf=store_conf)})
            if manager: self._manager = ServerModule(DocManager(self._dlm))
            if manager == 'ui': self._docweb = DocWebModule(doc_server=self._manager)
            if server: self._kbs = ServerModule(self._kbs)
            self._global_metadata_desc = doc_fields

        @property
        def url(self):
            if hasattr(self, '_manager'): return self._manager._url
            return None

        @property
        @deprecated('Document.manager.url')
        def _url(self):
            return self.url

        @property
        def web_url(self):
            if hasattr(self, '_docweb'): return self._docweb.url
            return None

        def _get_embeds(self, embed):
            embeds = embed if isinstance(embed, dict) else {EMBED_DEFAULT_KEY: embed} if embed else {}
            for embed in embeds.values():
                if isinstance(embed, ModuleBase):
                    self._submodules.append(embed)
            return embeds

        def add_kb_group(self, name, doc_fields: Optional[Dict[str, DocField]] = None,
                         store_conf: Optional[Dict] = None,
                         embed: Optional[Union[Callable, Dict[str, Callable]]] = None):
            embed = self._get_embeds(embed) if embed else self._embed
            if isinstance(self._kbs, ServerModule):
                self._kbs._impl._m[name] = DocImpl(dlm=self._dlm, embed=embed, kb_group_name=name,
                                                   global_metadata_desc=doc_fields, store_conf=store_conf)
            else:
                self._kbs[name] = DocImpl(dlm=self._dlm, embed=self._embed, kb_group_name=name,
                                          global_metadata_desc=doc_fields, store_conf=store_conf)
            self._dlm.add_kb_group(name=name)

        def get_doc_by_kb_group(self, name):
            return self._kbs._impl._m[name] if isinstance(self._kbs, ServerModule) else self._kbs[name]

        def stop(self):
            if hasattr(self, '_docweb'):
                self._docweb.stop()
            self._launcher.cleanup()

        def __call__(self, *args, **kw):
            return self._kbs(*args, **kw)

    def __init__(self, dataset_path: str, embed: Optional[Union[Callable, Dict[str, Callable]]] = None,
                 create_ui: bool = False, manager: Union[bool, str] = False, server: bool = False,
                 name: Optional[str] = None, launcher: Optional[Launcher] = None,
                 doc_fields: Dict[str, DocField] = None, store_conf: Optional[Dict] = None):
        super().__init__()
        if create_ui:
            lazyllm.LOG.warning('`create_ui` for Document is deprecated, use `manager` instead')
        if isinstance(manager, Document._Manager):
            assert not server, 'Server information is already set by manager'
            assert not launcher, 'Launcher information is already set by manager'
            if dataset_path != manager._dataset_path and dataset_path != manager._origin_path:
                raise RuntimeError(f'Document path mismatch, expected `{manager._dataset_path}` '
                                   f'while received `{dataset_path}`')
            manager.add_kb_group(name=name, doc_fields=doc_fields, store_conf=store_conf, embed=embed)
            self._manager = manager
            self._curr_group = name
        else:
            self._manager = Document._Manager(dataset_path, embed, create_ui or manager, server, name,
                                              launcher, store_conf, doc_fields)
            self._curr_group = DocListManager.DEFAULT_GROUP_NAME

    @deprecated('Document(dataset_path, manager=doc.manager, name=xx, doc_fields=xx, store_conf=xx)')
    def create_kb_group(self, name: str, doc_fields: Optional[Dict[str, DocField]] = None,
                        store_conf: Optional[Dict] = None) -> "Document":
        self._manager.add_kb_group(name=name, doc_fields=doc_fields, store_conf=store_conf)
        doc = copy.copy(self)
        doc._curr_group = name
        return doc

    @property
    @deprecated('Document._manager')
    def _impls(self): return self._manager

    @property
    def _impl(self): return self._manager.get_doc_by_kb_group(self._curr_group)

    @property
    def manager(self): return self._manager

    @DynamicDescriptor
    def create_node_group(self, name: str = None, *, transform: Callable, parent: str = LAZY_ROOT_NAME,
                          trans_node: bool = None, num_workers: int = 0, **kwargs) -> None:
        """
Generate a node group produced by the specified rule.

Args:
    name (str): The name of the node group.
    transform (Callable): The transformation rule that converts a node into a node group. The function prototype is `(DocNode, group_name, **kwargs) -> List[DocNode]`. Currently built-in options include [SentenceSplitter][lazyllm.tools.SentenceSplitter], and users can define their own transformation rules.
    trans_node (bool): Determines whether the input and output of transform are `DocNode` or `str`, default is None. Can only be set to true when `transform` is `Callable`.
    num_workers (int): number of new threads used for transform. default: 0
    parent (str): The node that needs further transformation. The series of new nodes obtained after transformation will be child nodes of this parent node. If not specified, the transformation starts from the root node.
    kwargs: Parameters related to the specific implementation.


Examples:

    >>> import lazyllm
    >>> from lazyllm.tools import Document, SentenceSplitter
    >>> m = lazyllm.OnlineEmbeddingModule(source="glm")
    >>> documents = Document(dataset_path='your_doc_path', embed=m, manager=False)
    >>> documents.create_node_group(name="sentences", transform=SentenceSplitter, chunk_size=1024, chunk_overlap=100)
    """
        if isinstance(self, type):
            DocImpl.create_global_node_group(name, transform=transform, parent=parent, trans_node=trans_node,
                                             num_workers=num_workers, **kwargs)
        else:
            self._impl.create_node_group(name, transform=transform, parent=parent, trans_node=trans_node,
                                         num_workers=num_workers, **kwargs)

    @DynamicDescriptor
    def add_reader(self, pattern: str, func: Optional[Callable] = None):
        """
Used to specify the file reader for an instance. The scope of action is visible only to the registered Document object. The registered file reader must be a Callable object. It can only be registered by calling a function. The priority of the file reader registered by the instance is higher than that of the file reader registered by the class, and the priority of the file reader registered by the instance and class is higher than the system default file reader. That is, the order of priority is: instance file reader > class file reader > system default file reader.

Args:
    pattern (str): Matching rules applied by the file reader.
    func (Callable): File reader, must be a Callable object.


Examples:

    >>> from lazyllm.tools.rag import Document, DocNode
    >>> from lazyllm.tools.rag.readers import ReaderBase
    >>> class YmlReader(ReaderBase):
    ...     def _load_data(self, file, extra_info=None, fs=None):
    ...         try:
    ...             import yaml
    ...         except ImportError:
    ...             raise ImportError("yaml is required to read YAML file: `pip install pyyaml`")
    ...         with open(file, 'r') as f:
    ...             data = yaml.safe_load(f)
    ...         print("Call the class YmlReader.")
    ...         return [DocNode(text=data, metadata=extra_info or {})]
    ...
    >>> def processYml(file, extra_info=None):
    ...     with open(file, 'r') as f:
    ...         data = f.read()
    ...     print("Call the function processYml.")
    ...     return [DocNode(text=data, metadata=extra_info or {})]
    ...
    >>> doc1 = Document(dataset_path="your_files_path", create_ui=False)
    >>> doc2 = Document(dataset_path="your_files_path", create_ui=False)
    >>> doc1.add_reader("**/*.yml", YmlReader)
    >>> print(doc1._impl._local_file_reader)
    {'**/*.yml': <class '__main__.YmlReader'>}
    >>> print(doc2._impl._local_file_reader)
    {}
    >>> files = ["your_yml_files"]
    >>> Document.register_global_reader("**/*.yml", processYml)
    >>> doc1._impl._reader.load_data(input_files=files)
    Call the class YmlReader.
    >>> doc2._impl._reader.load_data(input_files=files)
    Call the function processYml.
    """
        if isinstance(self, type):
            return DocImpl.register_global_reader(pattern=pattern, func=func)
        else:
            self._impl.add_reader(pattern, func)

    @classmethod
    def register_global_reader(cls, pattern: str, func: Optional[Callable] = None):
        """
Used to specify a file reader, which is visible to all Document objects. The registered file reader must be a Callable object. It can be registered using a decorator or by a function call.

Args:
    pattern (str): Matching rules applied by the file reader.
    func (Callable): File reader, must be a Callable object.


Examples:

    >>> from lazyllm.tools.rag import Document, DocNode
    >>> @Document.register_global_reader("**/*.yml")
    >>> def processYml(file, extra_info=None):
    ...     with open(file, 'r') as f:
    ...         data = f.read()
    ...     return [DocNode(text=data, metadata=extra_info or {})]
    ...
    >>> doc1 = Document(dataset_path="your_files_path", create_ui=False)
    >>> doc2 = Document(dataset_path="your_files_path", create_ui=False)
    >>> files = ["your_yml_files"]
    >>> docs1 = doc1._impl._reader.load_data(input_files=files)
    >>> docs2 = doc2._impl._reader.load_data(input_files=files)
    >>> print(docs1[0].text == docs2[0].text)
    # True
    """
        return cls.add_reader(pattern, func)

    def get_store(self):
        return StorePlaceholder()

    def get_embed(self):
        return EmbedPlaceholder()

    def register_index(self, index_type: str, index_cls: IndexBase, *args, **kwargs) -> None:
        self._impl.register_index(index_type, index_cls, *args, **kwargs)

    def _forward(self, func_name: str, *args, **kw):
        return self._manager(self._curr_group, func_name, *args, **kw)

    def find_parent(self, target) -> Callable:
        """
Find the parent node of the specified node.

Args:
    group (str): The name of the node for which to find the parent.


Examples:

    >>> import lazyllm
    >>> from lazyllm.tools import Document, SentenceSplitter
    >>> m = lazyllm.OnlineEmbeddingModule(source="glm")
    >>> documents = Document(dataset_path='your_doc_path', embed=m, manager=False)
    >>> documents.create_node_group(name="parent", transform=SentenceSplitter, chunk_size=1024, chunk_overlap=100)
    >>> documents.create_node_group(name="children", transform=SentenceSplitter, parent="parent", chunk_size=1024, chunk_overlap=100)
    >>> documents.find_parent('children')
    """
        # TODO: Currently, when a DocNode is returned from the server, it will carry all parent nodes and child nodes.
        # So the query of parent and child nodes can be performed locally, and there is no need to search the
        # document service through the server for the time being. When this item is optimized, the code will become:
        # return functools.partial(self._forward, 'find_parent', group=target)
        return functools.partial(Document.find_parent, group=target)

    def find_children(self, target) -> Callable:
        """
Find the child nodes of the specified node.

Args:
    group (str): The name of the node for which to find the children.


Examples:

    >>> import lazyllm
    >>> from lazyllm.tools import Document, SentenceSplitter
    >>> m = lazyllm.OnlineEmbeddingModule(source="glm")
    >>> documents = Document(dataset_path='your_doc_path', embed=m, manager=False)
    >>> documents.create_node_group(name="parent", transform=SentenceSplitter, chunk_size=1024, chunk_overlap=100)
    >>> documents.create_node_group(name="children", transform=SentenceSplitter, parent="parent", chunk_size=1024, chunk_overlap=100)
    >>> documents.find_children('parent')
    """
        # TODO: Currently, when a DocNode is returned from the server, it will carry all parent nodes and child nodes.
        # So the query of parent and child nodes can be performed locally, and there is no need to search the
        # document service through the server for the time being. When this item is optimized, the code will become:
        # return functools.partial(self._forward, 'find_children', group=target)
        return functools.partial(Document.find_children, group=target)

    def forward(self, *args, **kw) -> List[DocNode]:
        return self._forward('retrieve', *args, **kw)

    def __repr__(self):
        return lazyllm.make_repr("Module", "Document", manager=hasattr(self._manager, '_manager'),
                                 server=isinstance(self._manager._kbs, ServerModule))

add_reader(pattern, func=None)

Specifies a file reader for a single instance; it is visible only to the Document object on which it is registered. The reader must be a Callable object and can only be registered through a function call (not as a decorator). Readers registered on an instance take precedence over readers registered on the class, which in turn take precedence over the system default readers; that is, the priority order is: instance file reader > class file reader > system default file reader.

Parameters:

  • pattern (str) –

    Matching rules applied by the file reader.

  • func (Callable, default: None ) –

    File reader, must be a Callable object.

Examples:

>>> from lazyllm.tools.rag import Document, DocNode
>>> from lazyllm.tools.rag.readers import ReaderBase
>>> class YmlReader(ReaderBase):
...     def _load_data(self, file, extra_info=None, fs=None):
...         try:
...             import yaml
...         except ImportError:
...             raise ImportError("yaml is required to read YAML file: `pip install pyyaml`")
...         with open(file, 'r') as f:
...             data = yaml.safe_load(f)
...         print("Call the class YmlReader.")
...         return [DocNode(text=data, metadata=extra_info or {})]
...
>>> def processYml(file, extra_info=None):
...     with open(file, 'r') as f:
...         data = f.read()
...     print("Call the function processYml.")
...     return [DocNode(text=data, metadata=extra_info or {})]
...
>>> doc1 = Document(dataset_path="your_files_path", create_ui=False)
>>> doc2 = Document(dataset_path="your_files_path", create_ui=False)
>>> doc1.add_reader("**/*.yml", YmlReader)
>>> print(doc1._impl._local_file_reader)
{'**/*.yml': <class '__main__.YmlReader'>}
>>> print(doc2._impl._local_file_reader)
{}
>>> files = ["your_yml_files"]
>>> Document.register_global_reader("**/*.yml", processYml)
>>> doc1._impl._reader.load_data(input_files=files)
Call the class YmlReader.
>>> doc2._impl._reader.load_data(input_files=files)
Call the function processYml.
Source code in lazyllm/tools/rag/document.py
    @DynamicDescriptor
    def add_reader(self, pattern: str, func: Optional[Callable] = None):
        """
Used to specify the file reader for an instance. The scope of action is visible only to the registered Document object. The registered file reader must be a Callable object. It can only be registered by calling a function. The priority of the file reader registered by the instance is higher than that of the file reader registered by the class, and the priority of the file reader registered by the instance and class is higher than the system default file reader. That is, the order of priority is: instance file reader > class file reader > system default file reader.

Args:
    pattern (str): Matching rules applied by the file reader.
    func (Callable): File reader, must be a Callable object.


Examples:

    >>> from lazyllm.tools.rag import Document, DocNode
    >>> from lazyllm.tools.rag.readers import ReaderBase
    >>> class YmlReader(ReaderBase):
    ...     def _load_data(self, file, extra_info=None, fs=None):
    ...         try:
    ...             import yaml
    ...         except ImportError:
    ...             raise ImportError("yaml is required to read YAML file: `pip install pyyaml`")
    ...         with open(file, 'r') as f:
    ...             data = yaml.safe_load(f)
    ...         print("Call the class YmlReader.")
    ...         return [DocNode(text=data, metadata=extra_info or {})]
    ...
    >>> def processYml(file, extra_info=None):
    ...     with open(file, 'r') as f:
    ...         data = f.read()
    ...     print("Call the function processYml.")
    ...     return [DocNode(text=data, metadata=extra_info or {})]
    ...
    >>> doc1 = Document(dataset_path="your_files_path", create_ui=False)
    >>> doc2 = Document(dataset_path="your_files_path", create_ui=False)
    >>> doc1.add_reader("**/*.yml", YmlReader)
    >>> print(doc1._impl._local_file_reader)
    {'**/*.yml': <class '__main__.YmlReader'>}
    >>> print(doc2._impl._local_file_reader)
    {}
    >>> files = ["your_yml_files"]
    >>> Document.register_global_reader("**/*.yml", processYml)
    >>> doc1._impl._reader.load_data(input_files=files)
    Call the class YmlReader.
    >>> doc2._impl._reader.load_data(input_files=files)
    Call the function processYml.
    """
        if isinstance(self, type):
            return DocImpl.register_global_reader(pattern=pattern, func=func)
        else:
            self._impl.add_reader(pattern, func)

create_node_group(name=None, *, transform, parent=LAZY_ROOT_NAME, trans_node=None, num_workers=0, **kwargs)

Generate a node group produced by the specified rule.

Parameters:

  • name (str, default: None ) –

    The name of the node group.

  • transform (Callable) –

    The transformation rule that converts a node into a node group. The function prototype is (DocNode, group_name, **kwargs) -> List[DocNode]. Currently built-in options include SentenceSplitter, and users can define their own transformation rules.

  • trans_node (bool, default: None ) –

    Determines whether the input and output of transform are DocNode or str. Defaults to None. Can only be set to True when transform is a Callable.

  • num_workers (int, default: 0 ) –

    Number of new threads used for the transform. Defaults to 0.

  • parent (str, default: LAZY_ROOT_NAME ) –

    The node that needs further transformation. The series of new nodes obtained after transformation will be child nodes of this parent node. If not specified, the transformation starts from the root node.

  • kwargs

    Parameters related to the specific implementation.

Examples:

>>> import lazyllm
>>> from lazyllm.tools import Document, SentenceSplitter
>>> m = lazyllm.OnlineEmbeddingModule(source="glm")
>>> documents = Document(dataset_path='your_doc_path', embed=m, manager=False)
>>> documents.create_node_group(name="sentences", transform=SentenceSplitter, chunk_size=1024, chunk_overlap=100)
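
A custom transform can also be supplied. The sketch below follows the prototype stated above, (DocNode, group_name, **kwargs) -> List[DocNode]; the paragraph-splitting rule and the group name "paragraphs" are purely illustrative:

>>> from lazyllm.tools.rag import DocNode
>>> def split_paragraphs(node, group_name, **kwargs):
...     # one child node per non-empty paragraph of the parent node's text
...     return [DocNode(text=p) for p in node.text.split("\n\n") if p.strip()]
...
>>> documents.create_node_group(name="paragraphs", transform=split_paragraphs)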
Source code in lazyllm/tools/rag/document.py
    @DynamicDescriptor
    def create_node_group(self, name: str = None, *, transform: Callable, parent: str = LAZY_ROOT_NAME,
                          trans_node: bool = None, num_workers: int = 0, **kwargs) -> None:
        """
Generate a node group produced by the specified rule.

Args:
    name (str): The name of the node group.
    transform (Callable): The transformation rule that converts a node into a node group. The function prototype is `(DocNode, group_name, **kwargs) -> List[DocNode]`. Currently built-in options include [SentenceSplitter][lazyllm.tools.SentenceSplitter], and users can define their own transformation rules.
    trans_node (bool): Determines whether the input and output of transform are `DocNode` or `str`, default is None. Can only be set to true when `transform` is `Callable`.
    num_workers (int): number of new threads used for transform. default: 0
    parent (str): The node that needs further transformation. The series of new nodes obtained after transformation will be child nodes of this parent node. If not specified, the transformation starts from the root node.
    kwargs: Parameters related to the specific implementation.


Examples:

    >>> import lazyllm
    >>> from lazyllm.tools import Document, SentenceSplitter
    >>> m = lazyllm.OnlineEmbeddingModule(source="glm")
    >>> documents = Document(dataset_path='your_doc_path', embed=m, manager=False)
    >>> documents.create_node_group(name="sentences", transform=SentenceSplitter, chunk_size=1024, chunk_overlap=100)
    """
        if isinstance(self, type):
            DocImpl.create_global_node_group(name, transform=transform, parent=parent, trans_node=trans_node,
                                             num_workers=num_workers, **kwargs)
        else:
            self._impl.create_node_group(name, transform=transform, parent=parent, trans_node=trans_node,
                                         num_workers=num_workers, **kwargs)

find_children(target)

Find the child nodes of the specified node.

Parameters:

  • group (str) –

    The name of the node for which to find the children.

Examples:

>>> import lazyllm
>>> from lazyllm.tools import Document, SentenceSplitter
>>> m = lazyllm.OnlineEmbeddingModule(source="glm")
>>> documents = Document(dataset_path='your_doc_path', embed=m, manager=False)
>>> documents.create_node_group(name="parent", transform=SentenceSplitter, chunk_size=1024, chunk_overlap=100)
>>> documents.create_node_group(name="children", transform=SentenceSplitter, parent="parent", chunk_size=1024, chunk_overlap=100)
>>> documents.find_children('parent')
Source code in lazyllm/tools/rag/document.py
    def find_children(self, target) -> Callable:
        """
Find the child nodes of the specified node.

Args:
    group (str): The name of the node for which to find the children.


Examples:

    >>> import lazyllm
    >>> from lazyllm.tools import Document, SentenceSplitter
    >>> m = lazyllm.OnlineEmbeddingModule(source="glm")
    >>> documents = Document(dataset_path='your_doc_path', embed=m, manager=False)
    >>> documents.create_node_group(name="parent", transform=SentenceSplitter, chunk_size=1024, chunk_overlap=100)
    >>> documents.create_node_group(name="children", transform=SentenceSplitter, parent="parent", chunk_size=1024, chunk_overlap=100)
    >>> documents.find_children('parent')
    """
        # TODO: Currently, when a DocNode is returned from the server, it will carry all parent nodes and child nodes.
        # So the query of parent and child nodes can be performed locally, and there is no need to search the
        # document service through the server for the time being. When this item is optimized, the code will become:
        # return functools.partial(self._forward, 'find_children', group=target)
        return functools.partial(Document.find_children, group=target)

find_parent(target)

Find the parent node of the specified node.

Parameters:

  • group (str) –

    The name of the node for which to find the parent.

Examples:

>>> import lazyllm
>>> from lazyllm.tools import Document, SentenceSplitter
>>> m = lazyllm.OnlineEmbeddingModule(source="glm")
>>> documents = Document(dataset_path='your_doc_path', embed=m, manager=False)
>>> documents.create_node_group(name="parent", transform=SentenceSplitter, chunk_size=1024, chunk_overlap=100)
>>> documents.create_node_group(name="children", transform=SentenceSplitter, parent="parent", chunk_size=1024, chunk_overlap=100)
>>> documents.find_parent('children')
Source code in lazyllm/tools/rag/document.py
    def find_parent(self, target) -> Callable:
        """
Find the parent node of the specified node.

Args:
    group (str): The name of the node for which to find the parent.


Examples:

    >>> import lazyllm
    >>> from lazyllm.tools import Document, SentenceSplitter
    >>> m = lazyllm.OnlineEmbeddingModule(source="glm")
    >>> documents = Document(dataset_path='your_doc_path', embed=m, manager=False)
    >>> documents.create_node_group(name="parent", transform=SentenceSplitter, chunk_size=1024, chunk_overlap=100)
    >>> documents.create_node_group(name="children", transform=SentenceSplitter, parent="parent", chunk_size=1024, chunk_overlap=100)
    >>> documents.find_parent('children')
    """
        # TODO: Currently, when a DocNode is returned from the server, it will carry all parent nodes and child nodes.
        # So the query of parent and child nodes can be performed locally, and there is no need to search the
        # document service through the server for the time being. When this item is optimized, the code will become:
        # return functools.partial(self._forward, 'find_parent', group=target)
        return functools.partial(Document.find_parent, group=target)

register_global_reader(pattern, func=None) classmethod

Used to specify a file reader, which is visible to all Document objects. The registered file reader must be a Callable object. It can be registered using a decorator or by a function call.

Parameters:

  • pattern (str) –

    Matching rules applied by the file reader.

  • func (Callable, default: None ) –

    File reader, must be a Callable object.

Examples:

>>> from lazyllm.tools.rag import Document, DocNode
>>> @Document.register_global_reader("**/*.yml")
>>> def processYml(file, extra_info=None):
...     with open(file, 'r') as f:
...         data = f.read()
...     return [DocNode(text=data, metadata=extra_info or {})]
...
>>> doc1 = Document(dataset_path="your_files_path", create_ui=False)
>>> doc2 = Document(dataset_path="your_files_path", create_ui=False)
>>> files = ["your_yml_files"]
>>> docs1 = doc1._impl._reader.load_data(input_files=files)
>>> docs2 = doc2._impl._reader.load_data(input_files=files)
>>> print(docs1[0].text == docs2[0].text)
# True
Source code in lazyllm/tools/rag/document.py
    @classmethod
    def register_global_reader(cls, pattern: str, func: Optional[Callable] = None):
        """
Used to specify a file reader, which is visible to all Document objects. The registered file reader must be a Callable object. It can be registered using a decorator or by a function call.

Args:
    pattern (str): Matching rules applied by the file reader.
    func (Callable): File reader, must be a Callable object.


Examples:

    >>> from lazyllm.tools.rag import Document, DocNode
    >>> @Document.register_global_reader("**/*.yml")
    >>> def processYml(file, extra_info=None):
    ...     with open(file, 'r') as f:
    ...         data = f.read()
    ...     return [DocNode(text=data, metadata=extra_info or {})]
    ...
    >>> doc1 = Document(dataset_path="your_files_path", create_ui=False)
    >>> doc2 = Document(dataset_path="your_files_path", create_ui=False)
    >>> files = ["your_yml_files"]
    >>> docs1 = doc1._impl._reader.load_data(input_files=files)
    >>> docs2 = doc2._impl._reader.load_data(input_files=files)
    >>> print(docs1[0].text == docs2[0].text)
    # True
    """
        return cls.add_reader(pattern, func)

lazyllm.tools.rag.readers.ReaderBase

Bases: ModuleBase

The base class for file readers. It inherits from ModuleBase and is callable. Subclasses only need to implement the _load_data function, whose return type is List[DocNode]. Typically, the input parameters of _load_data are file (Path), extra_info (Dict), and fs (AbstractFileSystem).

Parameters:

  • args (Any, default: () ) –

    Pass the corresponding positional arguments as needed.

  • return_trace (bool, default: True ) –

    Set whether to record trace logs.

  • kwargs (Dict, default: {} ) –

    Pass the corresponding keyword arguments as needed.

Examples:

>>> from lazyllm.tools.rag.readers import ReaderBase
>>> from lazyllm.tools.rag import DocNode, Document
>>> from typing import Dict, Optional, List
>>> from pathlib import Path
>>> from fsspec import AbstractFileSystem
>>> @Document.register_global_reader("**/*.yml")
>>> class YmlReader(ReaderBase):
...     def _load_data(self, file: Path, extra_info: Optional[Dict] = None, fs: Optional[AbstractFileSystem] = None) -> List[DocNode]:
...         try:
...             import yaml
...         except ImportError:
...             raise ImportError("yaml is required to read YAML file: `pip install pyyaml`")
...         with open(file, 'r') as f:
...             data = yaml.safe_load(f)
...         print("Call the class YmlReader.")
...         return [DocNode(text=data, metadata=extra_info or {})]
...
>>> files = ["your_yml_files"]
>>> doc = Document(dataset_path="your_files_path", create_ui=False)
>>> reader = doc._impl._reader.load_data(input_files=files)
# Call the class YmlReader.
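
Besides overriding _load_data, a reader can implement only the generator _lazy_load_data; the base class's _load_data (see the source below) simply materializes it into a list. A minimal sketch reusing the imports above, with the one-node-per-line policy chosen purely for illustration:

>>> from typing import Iterable
>>> class LineReader(ReaderBase):
...     def _lazy_load_data(self, file: Path, extra_info: Optional[Dict] = None, fs: Optional[AbstractFileSystem] = None) -> Iterable[DocNode]:
...         # yield one DocNode per non-empty line; the inherited _load_data collects them into a list
...         with open(file, 'r') as f:
...             for line in f:
...                 if line.strip():
...                     yield DocNode(text=line.strip(), metadata=extra_info or {})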
Source code in lazyllm/tools/rag/readers/readerBase.py
class LazyLLMReaderBase(ModuleBase, metaclass=LazyLLMRegisterMetaClass):
    """
The base class of file readers, which inherits from the ModuleBase base class and has Callable capabilities. Subclasses that inherit from this class only need to implement the _load_data function, and its return parameter type is List[DocNode]. Generally, the input parameters of the _load_data function are file (Path), extra_info(Dict), and fs (AbstractFileSystem).

Args:
    args (Any): Pass the corresponding position parameters as needed.
    return_trace (bool): Set whether to record trace logs.
    kwargs (Dict): Pass the corresponding keyword arguments as needed.


Examples:

    >>> from lazyllm.tools.rag.readers import ReaderBase
    >>> from lazyllm.tools.rag import DocNode, Document
    >>> from typing import Dict, Optional, List
    >>> from pathlib import Path
    >>> from fsspec import AbstractFileSystem
    >>> @Document.register_global_reader("**/*.yml")
    >>> class YmlReader(ReaderBase):
    ...     def _load_data(self, file: Path, extra_info: Optional[Dict] = None, fs: Optional[AbstractFileSystem] = None) -> List[DocNode]:
    ...         try:
    ...             import yaml
    ...         except ImportError:
    ...             raise ImportError("yaml is required to read YAML file: `pip install pyyaml`")
    ...         with open(file, 'r') as f:
    ...             data = yaml.safe_load(f)
    ...         print("Call the class YmlReader.")
    ...         return [DocNode(text=data, metadata=extra_info or {})]
    ...
    >>> files = ["your_yml_files"]
    >>> doc = Document(dataset_path="your_files_path", create_ui=False)
    >>> reader = doc._impl._reader.load_data(input_files=files)
    # Call the class YmlReader.
    """
    def __init__(self, *args, return_trace: bool = True, **kwargs):
        super().__init__(return_trace=return_trace)

    def _lazy_load_data(self, *args, **load_kwargs) -> Iterable[DocNode]:
        raise NotImplementedError(f"{self.__class__.__name__} does not implement lazy_load_data method.")

    def _load_data(self, *args, **load_kwargs) -> List[DocNode]:
        return list(self._lazy_load_data(*args, **load_kwargs))

    def forward(self, *args, **kwargs) -> List[DocNode]:
        return self._load_data(*args, **kwargs)

lazyllm.tools.Reranker

Bases: ModuleBase, _PostProcess

Initializes a Reranker module for postprocessing and reranking of nodes (documents). The reranking process is configured by the specified reranker type, which allows dynamic selection and instantiation of reranking kernels (algorithms) based on that type and the provided keyword arguments.

Parameters:

  • name (str, default: 'ModuleReranker' ) –

    The type of reranker to be used for the postprocessing and reranking process. Defaults to 'ModuleReranker'.

  • kwargs

    Additional keyword arguments that are passed to the reranker upon its instantiation.

Detailed explanation of reranker types

  • Reranker: This registered reranking function instantiates a SentenceTransformerRerank reranker with a specified model and top_n parameter. It is designed to rerank nodes based on sentence transformer embeddings.

  • KeywordFilter: This registered reranking function instantiates a KeywordNodePostprocessor with specified required and excluded keywords. It filters nodes based on the presence or absence of these keywords.

Examples:

>>> import lazyllm
>>> from lazyllm.tools import Document, Reranker, Retriever
>>> m = lazyllm.OnlineEmbeddingModule()
>>> documents = Document(dataset_path='/path/to/user/data', embed=m, manager=False)
>>> retriever = Retriever(documents, group_name='CoarseChunk', similarity='bm25', similarity_cut_off=0.01, topk=6)
>>> reranker = Reranker(name='ModuleReranker', model='bge-reranker-large', topk=1)
>>> ppl = lazyllm.ActionModule(retriever, reranker)
>>> ppl.start()
>>> print(ppl("user query"))
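
Custom rerankers can be registered through Reranker.register_reranker (shown in the source below); by default the registered function is called once per node, and nodes for which it returns a falsy value are dropped. A minimal sketch, where the 20-character threshold and the name LongTextFilter are illustrative:

>>> @Reranker.register_reranker
>>> def LongTextFilter(node, **kwargs):
...     # keep only nodes whose content is longer than 20 characters
...     return node if len(node.get_content()) > 20 else None
...
>>> filter_reranker = Reranker(name='LongTextFilter')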
Source code in lazyllm/tools/rag/rerank.py
class Reranker(ModuleBase, _PostProcess):
    """Initializes a Rerank module for postprocessing and reranking of nodes (documents).
This constructor initializes a Reranker module that configures a reranking process based on a specified reranking type. It allows for the dynamic selection and instantiation of reranking kernels (algorithms) based on the type and provided keyword arguments.

Args:
    name: The type of reranker to be used for the postprocessing and reranking process. Defaults to 'ModuleReranker'.
    kwargs: Additional keyword arguments that are passed to the reranker upon its instantiation.

**Detailed explanation of reranker types**

- Reranker: This registered reranking function instantiates a SentenceTransformerRerank reranker with a specified model and top_n parameter. It is designed to rerank nodes based on sentence transformer embeddings.

- KeywordFilter: This registered reranking function instantiates a KeywordNodePostprocessor with specified required and excluded keywords. It filters nodes based on the presence or absence of these keywords.


Examples:

    >>> import lazyllm
    >>> from lazyllm.tools import Document, Reranker, Retriever
    >>> m = lazyllm.OnlineEmbeddingModule()
    >>> documents = Document(dataset_path='/path/to/user/data', embed=m, manager=False)
    >>> retriever = Retriever(documents, group_name='CoarseChunk', similarity='bm25', similarity_cut_off=0.01, topk=6)
    >>> reranker = Reranker(name='ModuleReranker', model='bge-reranker-large', topk=1)
    >>> ppl = lazyllm.ActionModule(retriever, reranker)
    >>> ppl.start()
    >>> print(ppl("user query"))
    """
    registered_reranker = dict()

    def __new__(cls, name: str = "ModuleReranker", *args, **kwargs):
        assert name in cls.registered_reranker, f"Reranker: {name} is not registered, please register first."
        item = cls.registered_reranker[name]
        if isinstance(item, type) and issubclass(item, Reranker):
            return super(Reranker, cls).__new__(item)
        else:
            return super(Reranker, cls).__new__(cls)

    def __init__(self, name: str = "ModuleReranker", target: Optional[str] = None,
                 output_format: Optional[str] = None, join: Union[bool, str] = False, **kwargs) -> None:
        super().__init__()
        self._name = name
        self._kwargs = kwargs
        _PostProcess.__init__(self, target, output_format, join)

    def forward(self, nodes: List[DocNode], query: str = "") -> List[DocNode]:
        results = self.registered_reranker[self._name](nodes, query=query, **self._kwargs)
        LOG.debug(f"Rerank use `{self._name}` and get nodes: {results}")
        return self._post_process(results)

    @classmethod
    def register_reranker(
        cls: "Reranker", func: Optional[Callable] = None, batch: bool = False
    ):
        def decorator(f):
            if isinstance(f, type):
                cls.registered_reranker[f.__name__] = f
                return f
            else:
                def wrapper(nodes, **kwargs):
                    if batch:
                        return f(nodes, **kwargs)
                    else:
                        results = [f(node, **kwargs) for node in nodes]
                        return [result for result in results if result]

                cls.registered_reranker[f.__name__] = wrapper
                return wrapper

        return decorator(func) if func else decorator

lazyllm.tools.Retriever

Bases: ModuleBase, _PostProcess

Creates a retrieval module for document querying. This constructor configures the document retrieval process based on the specified similarity metric.

Parameters:

  • doc (object) –

    An instance of the document module. The document module can be a single instance or a list of instances. If it is a single instance, it means searching for a single Document, and if it is a list of instances, it means searching for multiple Documents.

  • group_name (str) –

    The name of the node group on which to perform the retrieval.

  • similarity (Optional[str], default: None ) –

    The similarity function to use for document retrieval. Candidates include ["bm25", "bm25_chinese", "cosine"]. If not specified, embedding-based retrieval is used.

  • similarity_cut_off (Union[float, Dict[str, float]], default: float('-inf') ) –

    Discard a document when its similarity is below the specified value. In a multi-embedding scenario, pass a dictionary mapping embedding names to thresholds to set a different value per embedding; pass a single value to apply the same threshold to all embeddings.

  • index (str, default: 'default' ) –

    The type of index to use for document retrieval. Currently, only 'default' is supported.

  • topk (int, default: 6 ) –

    The number of documents to retrieve with the highest similarity.

  • embed_keys (Optional[List[str]], default: None ) –

    Indicates which embeddings are used for retrieval. If not specified, all embeddings are used for retrieval.

  • similarity_kw

    Additional parameters to pass to the similarity calculation function.

Three built-in node groups can be used as group_name; all of them split documents with SentenceSplitter and differ only in chunk size:

  • CoarseChunk: Chunk size is 1024, with an overlap length of 100
  • MediumChunk: Chunk size is 256, with an overlap length of 25
  • FineChunk: Chunk size is 128, with an overlap length of 12

Also, Image is available for group_name since LazyLLM supports image embedding and retrieval.

Examples:

>>> import lazyllm
>>> from lazyllm.tools import Retriever, Document, SentenceSplitter
>>> m = lazyllm.OnlineEmbeddingModule()
>>> documents = Document(dataset_path='/path/to/user/data', embed=m, manager=False)
>>> rm = Retriever(documents, group_name='CoarseChunk', similarity='bm25', similarity_cut_off=0.01, topk=6)
>>> rm.start()
>>> print(rm("user query"))
>>> m1 = lazyllm.TrainableModule('bge-large-zh-v1.5').start()
>>> document1 = Document(dataset_path='/path/to/user/data', embed={'online':m , 'local': m1}, manager=False)
>>> document1.create_node_group(name='sentences', transform=SentenceSplitter, chunk_size=1024, chunk_overlap=100)
>>> retriever = Retriever(document1, group_name='sentences', similarity='cosine', similarity_cut_off=0.4, embed_keys=['local'], topk=3)
>>> print(retriever("user query"))
>>> document2 = Document(dataset_path='/path/to/user/data', embed={'online':m , 'local': m1}, manager=False)
>>> document2.create_node_group(name='sentences', transform=SentenceSplitter, chunk_size=512, chunk_overlap=50)
>>> retriever2 = Retriever([document1, document2], group_name='sentences', similarity='cosine', similarity_cut_off=0.4, embed_keys=['local'], topk=3)
>>> print(retriever2("user query"))
>>>
>>> filters = {
>>>     "author": ["A", "B", "C"],
>>>     "public_year": [2002, 2003, 2004],
>>> }
>>> document3 = Document(dataset_path='/path/to/user/data', embed={'online':m , 'local': m1}, manager=False)
>>> document3.create_node_group(name='sentences', transform=SentenceSplitter, chunk_size=512, chunk_overlap=50)
>>> retriever3 = Retriever([document1, document3], group_name='sentences', similarity='cosine', similarity_cut_off=0.4, embed_keys=['local'], topk=3)
>>> print(retriever3(query="user query", filters=filters))
>>> document4 = Document(dataset_path='/path/to/user/data', embed=lazyllm.TrainableModule('siglip'))
>>> retriever4 = Retriever(document4, group_name='Image', similarity='cosine')
>>> nodes = retriever4("user query")
>>> print([node.get_content() for node in nodes])
Source code in lazyllm/tools/rag/retriever.py
class Retriever(ModuleBase, _PostProcess):
    """
Create a retrieval module for document querying and retrieval. This constructor initializes a retrieval module that configures the document retrieval process based on the specified similarity metric.

Args:
    doc: An instance of the document module. The document module can be a single instance or a list of instances. If it is a single instance, it means searching for a single Document, and if it is a list of instances, it means searching for multiple Documents.
    group_name: The name of the node group on which to perform the retrieval.
    similarity: The similarity function to use for document retrieval. Candidates include ["bm25", "bm25_chinese", "cosine"]. If not specified, embedding-based retrieval is used.
    similarity_cut_off: Discard the document when the similarity is below the specified value. In a multi-embedding scenario, if you need to specify different values for different embeddings, you need to specify them in a dictionary, where the key indicates which embedding is specified and the value indicates the corresponding threshold. If all embeddings use the same threshold, you only need to specify one value.
    index: The type of index to use for document retrieval. Currently, only 'default' is supported.
    topk: The number of documents to retrieve with the highest similarity.
    embed_keys: Indicates which embeddings are used for retrieval. If not specified, all embeddings are used for retrieval.
    similarity_kw: Additional parameters to pass to the similarity calculation function.

The `group_name` has three built-in splitting strategies, all of which use `SentenceSplitter` for splitting, with the difference being in the chunk size:

- CoarseChunk: Chunk size is 1024, with an overlap length of 100
- MediumChunk: Chunk size is 256, with an overlap length of 25
- FineChunk: Chunk size is 128, with an overlap length of 12

Also, `Image` is available for `group_name` since LazyLLM supports image embedding and retrieval.


Examples:

    >>> import lazyllm
    >>> from lazyllm.tools import Retriever, Document, SentenceSplitter
    >>> m = lazyllm.OnlineEmbeddingModule()
    >>> documents = Document(dataset_path='/path/to/user/data', embed=m, manager=False)
    >>> rm = Retriever(documents, group_name='CoarseChunk', similarity='bm25', similarity_cut_off=0.01, topk=6)
    >>> rm.start()
    >>> print(rm("user query"))
    >>> m1 = lazyllm.TrainableModule('bge-large-zh-v1.5').start()
    >>> document1 = Document(dataset_path='/path/to/user/data', embed={'online':m , 'local': m1}, manager=False)
    >>> document1.create_node_group(name='sentences', transform=SentenceSplitter, chunk_size=1024, chunk_overlap=100)
    >>> retriever = Retriever(document1, group_name='sentences', similarity='cosine', similarity_cut_off=0.4, embed_keys=['local'], topk=3)
    >>> print(retriever("user query"))
    >>> document2 = Document(dataset_path='/path/to/user/data', embed={'online':m , 'local': m1}, manager=False)
    >>> document2.create_node_group(name='sentences', transform=SentenceSplitter, chunk_size=512, chunk_overlap=50)
    >>> retriever2 = Retriever([document1, document2], group_name='sentences', similarity='cosine', similarity_cut_off=0.4, embed_keys=['local'], topk=3)
    >>> print(retriever2("user query"))
    >>>
    >>> filters = {
    >>>     "author": ["A", "B", "C"],
    >>>     "public_year": [2002, 2003, 2004],
    >>> }
    >>> document3 = Document(dataset_path='/path/to/user/data', embed={'online':m , 'local': m1}, manager=False)
    >>> document3.create_node_group(name='sentences', transform=SentenceSplitter, chunk_size=512, chunk_overlap=50)
    >>> retriever3 = Retriever([document1, document3], group_name='sentences', similarity='cosine', similarity_cut_off=0.4, embed_keys=['local'], topk=3)
    >>> print(retriever3(query="user query", filters=filters))
    >>> document4 = Document(dataset_path='/path/to/user/data', embed=lazyllm.TrainableModule('siglip'))
    >>> retriever4 = Retriever(document4, group_name='Image', similarity='cosine')
    >>> nodes = retriever4("user query")
    >>> print([node.get_content() for node in nodes])
    """
    __enable_request__ = False

    def __init__(
        self,
        doc: object,
        group_name: str,
        similarity: Optional[str] = None,
        similarity_cut_off: Union[float, Dict[str, float]] = float("-inf"),
        index: str = "default",
        topk: int = 6,
        embed_keys: Optional[List[str]] = None,
        target: Optional[str] = None,
        output_format: Optional[str] = None,
        join: Union[bool, str] = False,
        **kwargs,
    ):
        super().__init__()

        if similarity:
            _, mode, _ = registered_similarities[similarity]
        else:
            mode = 'embedding'  # TODO FIXME XXX should be removed after similarity args refactor

        self._docs: List[Document] = [doc] if isinstance(doc, Document) else doc
        for doc in self._docs:
            assert isinstance(doc, Document), 'Only Document or List[Document] are supported'
            self._submodules.append(doc)
            if mode == 'embedding' and not embed_keys:
                embed_keys = list(doc._impl.embed.keys())
            if embed_keys:
                doc._impl._activated_embeddings.setdefault(group_name, set()).update(embed_keys)

        self._group_name = group_name
        self._similarity = similarity  # similarity function str
        self._similarity_cut_off = similarity_cut_off
        self._index = index
        self._topk = topk
        self._similarity_kw = kwargs  # kw parameters
        self._embed_keys = embed_keys
        _PostProcess.__init__(self, target, output_format, join)

    @once_wrapper
    def _lazy_init(self):
        docs = [doc for doc in self._docs if self._group_name in doc._impl.node_groups or self._group_name
                in DocImpl._builtin_node_groups or self._group_name in DocImpl._global_node_groups]
        if not docs: raise RuntimeError(f'Group {self._group_name} not found in document {self._docs}')
        self._docs = docs

    def _get_post_process_tasks(self):
        return pipeline(lambda *a: self('Test Query'))

    def forward(
            self, query: str, filters: Optional[Dict[str, Union[str, int, List, Set]]] = None
    ) -> Union[List[DocNode], str]:
        self._lazy_init()
        nodes = []
        for doc in self._docs:
            nodes.extend(doc.forward(query=query, group_name=self._group_name, similarity=self._similarity,
                                     similarity_cut_off=self._similarity_cut_off, index=self._index,
                                     topk=self._topk, similarity_kws=self._similarity_kw, embed_keys=self._embed_keys,
                                     filters=filters))
        return self._post_process(nodes)

lazyllm.tools.rag.DocManager

Bases: ModuleBase

The DocManager class manages document lists and related operations, providing APIs for uploading, deleting, and grouping documents.

Parameters:

  • dlm (DocListManager) –

    Document list manager responsible for handling document-related operations.
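
A DocManager is normally created for you when a Document is constructed with manager=True and is served as a FastAPI application. A minimal sketch of calling its endpoints over HTTP, assuming a hypothetical service address and illustrative file names and metadata:

>>> import json
>>> import requests
>>> base_url = "http://localhost:20000"  # hypothetical address of the running document manager service
>>> print(requests.get(f"{base_url}/list_kb_groups").json())
>>> files = [("files", open(path, "rb")) for path in ["a.txt", "b.txt"]]
>>> metadatas = json.dumps([{"author": "A"}, {"author": "B"}])
>>> resp = requests.post(f"{base_url}/upload_files", params={"override": "true", "metadatas": metadatas}, files=files)
>>> print(resp.json())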

Source code in lazyllm/tools/rag/doc_manager.py
class DocManager(lazyllm.ModuleBase):
    """
The `DocManager` class manages document lists and related operations, providing APIs for uploading, deleting, and grouping documents.

Args:
    dlm (DocListManager): Document list manager responsible for handling document-related operations.

"""
    def __init__(self, dlm: DocListManager) -> None:
        super().__init__()
        # disable path monitoring to avoid races when files are added or deleted concurrently
        self._manager = dlm
        self._manager.enable_path_monitoring = False

    def __reduce__(self):
        # For unknown reasons, path monitoring ends up re-enabled after deserialization, so disable it again here
        self._manager.enable_path_monitoring = False
        return (__class__, (self._manager,))

    @app.get("/", response_model=BaseResponse, summary="docs")
    def document(self):
        """
An endpoint to redirect to the default documentation page.

**Returns:**

- RedirectResponse: Redirects to the `/docs` page.
"""
        return RedirectResponse(url="/docs")

    @app.get("/list_kb_groups")
    def list_kb_groups(self):
        """
An endpoint to list all document groups.

**Returns:**

- BaseResponse: Contains the data of all document groups.
"""
        try:
            return BaseResponse(data=self._manager.list_all_kb_group())
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)

    # returns an error message if invalid
    @staticmethod
    def _validate_metadata(metadata: Dict) -> Optional[str]:
        if metadata.get(RAG_DOC_ID):
            return f"metadata MUST not contain key `{RAG_DOC_ID}`"
        if metadata.get(RAG_DOC_PATH):
            return f"metadata MUST not contain key `{RAG_DOC_PATH}`"
        return None

    def _gen_unique_filepath(self, file_path: str) -> str:
        suffix = os.path.splitext(file_path)[1]
        prefix = file_path[0: len(file_path) - len(suffix)]
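        # Note: '%' is presumably treated as a wildcard by the manager's pattern lookup
        # (e.g. SQL LIKE), matching any text inserted between the prefix and the extension.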
        pattern = f"{prefix}%{suffix}"
        MAX_TRIES = 10000
        exist_paths = set(self._manager.get_existing_paths_by_pattern(pattern))
        if file_path not in exist_paths:
            return file_path
        for i in range(1, MAX_TRIES):
            new_path = f"{prefix}-{i}{suffix}"
            if new_path not in exist_paths:
                return new_path
        return f"{str(uuid.uuid4())}{suffix}"

    @app.post("/upload_files")
    def upload_files(self, files: List[UploadFile], override: bool = False,  # noqa C901
                     metadatas: Optional[str] = None, user_path: Optional[str] = None):
        """
An endpoint to upload files and update their status. Multiple files can be uploaded at once.

Args:
    files (List[UploadFile]): List of files to upload.
    override (bool): Whether to overwrite existing files. Default is False.
    metadatas (Optional[str]): Metadata for the files in JSON format.
    user_path (Optional[str]): User-defined path for file uploads.

**Returns:**

- BaseResponse: Upload results and file IDs.
"""
        try:
            if user_path: user_path = user_path.lstrip('/')
            if metadatas:
                metadatas: Optional[List[Dict[str, str]]] = json.loads(metadatas)
                if len(files) != len(metadatas):
                    return BaseResponse(code=400, msg='Length of files and metadatas should be the same',
                                        data=None)
                for idx, mt in enumerate(metadatas):
                    err_msg = self._validate_metadata(mt)
                    if err_msg:
                        return BaseResponse(code=400, msg=f'file [{files[idx].filename}]: {err_msg}', data=None)
            file_paths = [os.path.join(self._manager._path, user_path or '', file.filename) for file in files]
            paths_is_new = [True] * len(file_paths)
            if override is True:
                is_success, msg, paths_is_new = self._manager.validate_paths(file_paths)
                if not is_success:
                    return BaseResponse(code=500, msg=msg, data=None)
            directorys = set(os.path.dirname(path) for path in file_paths)
            [os.makedirs(directory, exist_ok=True) for directory in directorys if directory]
            ids, results = [], []
            for i in range(len(files)):
                file_path = file_paths[i]
                content = files[i].file.read()
                metadata = metadatas[i] if metadatas else None
                if override is False:
                    file_path = self._gen_unique_filepath(file_path)
                with open(file_path, 'wb') as f: f.write(content)
                msg = "success"
                doc_id = gen_docid(file_path)
                if paths_is_new[i]:
                    docs = self._manager.add_files(
                        [file_path], metadatas=[metadata], status=DocListManager.Status.success)
                    if not docs:
                        msg = f"Failed: path {file_path} already exists in Database."
                else:
                    self._manager.update_kb_group(cond_file_ids=[doc_id], new_need_reparse=True)
                    msg = f"Success: path {file_path} will be reparsed."
                ids.append(doc_id)
                results.append(msg)
            return BaseResponse(data=[ids, results])
        except Exception as e:
            lazyllm.LOG.error(f'upload_files exception: {e}')
            return BaseResponse(code=500, msg=str(e), data=None)

    @app.post("/add_files")
    def add_files(self, files: List[str] = Body(...),
                  group_name: str = Body(None),
                  metadatas: Optional[str] = Body(None)):
        try:
            if metadatas:
                metadatas: Optional[List[Dict[str, str]]] = json.loads(metadatas)
                assert len(files) == len(metadatas), 'Length of files and metadatas should be the same'

            exists_files_info = self._manager.list_files(limit=None, details=True, status=DocListManager.Status.all)
            exists_files_info = {row[2]: row[0] for row in exists_files_info}

            exist_ids = []
            new_files = []
            new_metadatas = []
            id_mapping = {}

            for idx, file in enumerate(files):
                if os.path.exists(file):
                    exist_id = exists_files_info.get(file, None)
                    if exist_id:
                        update_kws = dict(fileid=exist_id, status=DocListManager.Status.success)
                        if metadatas: update_kws["meta"] = json.dumps(metadatas[idx])
                        self._manager.update_file_message(**update_kws)
                        exist_ids.append(exist_id)
                        id_mapping[file] = exist_id
                    else:
                        new_files.append(file)
                        if metadatas:
                            new_metadatas.append(metadatas[idx])
                else:
                    id_mapping[file] = None

            new_ids = self._manager.add_files(new_files, metadatas=new_metadatas, status=DocListManager.Status.success)
            if group_name:
                self._manager.add_files_to_kb_group(new_ids + exist_ids, group=group_name)

            for file, new_id in zip(new_files, new_ids):
                id_mapping[file] = new_id
            return_ids = [id_mapping[file] for file in files]

            return BaseResponse(data=return_ids)
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)

    @app.get("/list_files")
    def list_files(self, limit: Optional[int] = None, details: bool = True, alive: Optional[bool] = None):
        """
An endpoint to list uploaded files.

Args:
    limit (Optional[int]): Limit on the number of files returned. Default is None.
    details (bool): Whether to return detailed information. Default is True.
    alive (Optional[bool]): If True, only returns non-deleted files. Default is None.

**Returns:**

- BaseResponse: File list data.
"""
        try:
            status = [DocListManager.Status.success, DocListManager.Status.waiting, DocListManager.Status.working,
                      DocListManager.Status.failed] if alive else DocListManager.Status.all
            return BaseResponse(data=self._manager.list_files(limit=limit, details=details, status=status))
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)

    @app.get("/list_files_in_group")
    def list_files_in_group(self, group_name: Optional[str] = None,
                            limit: Optional[int] = None, alive: Optional[bool] = None):
        """
An endpoint to list files in a specific group.

Args:
    group_name (Optional[str]): The name of the file group.
    limit (Optional[int]): Limit on the number of files returned. Default is None.
    alive (Optional[bool]): Whether to return only non-deleted files.

**Returns:**

- BaseResponse: List of files in the group.
"""
        try:
            status = [DocListManager.Status.success, DocListManager.Status.waiting, DocListManager.Status.working,
                      DocListManager.Status.failed] if alive else DocListManager.Status.all
            return BaseResponse(data=self._manager.list_kb_group_files(group_name, limit, details=True, status=status))
        except Exception as e:
            import traceback
            return BaseResponse(code=500, msg=str(e) + '\ntraceback:\n' + str(traceback.format_exc()), data=None)

    class FileGroupRequest(BaseModel):
        file_ids: List[str]
        group_name: Optional[str] = Field(None)

    @app.post("/add_files_to_group_by_id")
    def add_files_to_group_by_id(self, request: FileGroupRequest):
        """
An endpoint to add files to a specific group by file IDs.

Args:
    request (FileGroupRequest): Request containing file IDs and group name.

**Returns:**

- BaseResponse: Operation result.
"""
        try:
            self._manager.add_files_to_kb_group(request.file_ids, request.group_name)
            return BaseResponse()
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)

    @app.post("/add_files_to_group")
    def add_files_to_group(self, files: List[UploadFile], group_name: str, override: bool = False,
                           metadatas: Optional[str] = None, user_path: Optional[str] = None):
        """
An endpoint to upload files and directly add them to a specified group.

Args:
    files (List[UploadFile]): List of files to upload.
    group_name (str): Name of the group to add the files to.
    override (bool): Whether to overwrite existing files. Default is False.
    metadatas (Optional[str]): Metadata for the files in JSON format.
    user_path (Optional[str]): User-defined path for file uploads.

**Returns:**

- BaseResponse: Operation result and file IDs.
"""
        try:
            response = self.upload_files(files, override=override, metadatas=metadatas, user_path=user_path)
            if response.code != 200: return response
            ids = response.data[0]
            self._manager.add_files_to_kb_group(ids, group_name)
            return BaseResponse(data=ids)
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)

    @app.post("/delete_files")
    def delete_files(self, request: FileGroupRequest):
        """
An endpoint to delete specified files.

Args:
    request (FileGroupRequest): Request containing file IDs and group name.

**Returns:**

- BaseResponse: Deletion operation result.
"""
        try:
            if request.group_name:
                return self.delete_files_from_group(request)
            else:
                documents = self._manager.delete_files(request.file_ids)
                deleted_ids = set([ele.doc_id for ele in documents])
                for doc in documents:
                    if os.path.exists(path := doc.path):
                        os.remove(path)
                results = ["Success" if ele.doc_id in deleted_ids else "Failed" for ele in documents]
                return BaseResponse(data=[request.file_ids, results])
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)

    @app.post("/delete_files_from_group")
    def delete_files_from_group(self, request: FileGroupRequest):
        try:
            self._manager.update_kb_group(cond_file_ids=request.file_ids, cond_group=request.group_name,
                                          new_status=DocListManager.Status.deleting)
            return BaseResponse()
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)

    class AddMetadataRequest(BaseModel):
        doc_ids: List[str]
        kv_pair: Dict[str, Union[bool, int, float, str, list]]

    @app.post("/add_metadata")
    def add_metadata(self, add_metadata_request: AddMetadataRequest):
        doc_ids = add_metadata_request.doc_ids
        kv_pair = add_metadata_request.kv_pair
        try:
            docs = self._manager.get_docs(doc_ids)
            if not docs:
                return BaseResponse(code=400, msg="Failed, no doc found")
            doc_meta = {}
            for doc in docs:
                meta_dict = json.loads(doc.meta) if doc.meta else {}
                for k, v in kv_pair.items():
                    if k not in meta_dict or not meta_dict[k]:
                        meta_dict[k] = v
                    elif isinstance(meta_dict[k], list):
                        meta_dict[k].extend(v) if isinstance(v, list) else meta_dict[k].append(v)
                    else:
                        meta_dict[k] = ([meta_dict[k]] + v) if isinstance(v, list) else [meta_dict[k], v]
                doc_meta[doc.doc_id] = meta_dict
            self._manager.set_docs_new_meta(doc_meta)
            return BaseResponse(data=None)
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)

    class DeleteMetadataRequest(BaseModel):
        doc_ids: List[str]
        keys: Optional[List[str]] = Field(None)
        kv_pair: Optional[Dict[str, Union[bool, int, float, str, list]]] = Field(None)

    def _inplace_del_meta(self, meta_dict, kv_pair: Dict[str, Union[None, bool, int, float, str, list]]):
        # alert: meta_dict is not a deepcopy
        for k, v in kv_pair.items():
            if k not in meta_dict:
                continue
            if v is None:
                meta_dict.pop(k, None)
            elif isinstance(meta_dict[k], list):
                if isinstance(v, (bool, int, float, str)):
                    v = [v]
                # delete v exists in meta_dict[k]
                meta_dict[k] = list(set(meta_dict[k]) - set(v))
            else:
                # old meta[k] not a list, use v as condition to delete the key
                if meta_dict[k] == v:
                    meta_dict.pop(k, None)

    @app.post("/delete_metadata_item")
    def delete_metadata_item(self, del_metadata_request: DeleteMetadataRequest):
        doc_ids = del_metadata_request.doc_ids
        kv_pair = del_metadata_request.kv_pair
        keys = del_metadata_request.keys
        try:
            if keys is not None:
                # convert keys to kv_pair
                if kv_pair:
                    kv_pair.update({k: None for k in keys})
                else:
                    kv_pair = {k: None for k in keys}
            if not kv_pair:
                # clear metadata
                self._manager.set_docs_new_meta({doc_id: {} for doc_id in doc_ids})
            else:
                docs = self._manager.get_docs(doc_ids)
                if not docs:
                    return BaseResponse(code=400, msg="Failed, no doc found")
                doc_meta = {}
                for doc in docs:
                    meta_dict = json.loads(doc.meta) if doc.meta else {}
                    self._inplace_del_meta(meta_dict, kv_pair)
                    doc_meta[doc.doc_id] = meta_dict
                self._manager.set_docs_new_meta(doc_meta)
            return BaseResponse(data=None)
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)

    class UpdateMetadataRequest(BaseModel):
        doc_ids: List[str]
        kv_pair: Dict[str, Union[bool, int, float, str, list]]

    @app.post("/update_or_create_metadata_keys")
    def update_or_create_metadata_keys(self, update_metadata_request: UpdateMetadataRequest):
        doc_ids = update_metadata_request.doc_ids
        kv_pair = update_metadata_request.kv_pair
        try:
            docs = self._manager.get_docs(doc_ids)
            if not docs:
                return BaseResponse(code=400, msg="Failed, no doc found")
            for doc in docs:
                doc_meta = {}
                meta_dict = json.loads(doc.meta) if doc.meta else {}
                for k, v in kv_pair.items():
                    meta_dict[k] = v
                doc_meta[doc.doc_id] = meta_dict
            self._manager.set_docs_new_meta(doc_meta)
            return BaseResponse(data=None)
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)

    class ResetMetadataRequest(BaseModel):
        doc_ids: List[str]
        new_meta: Dict[str, Union[bool, int, float, str, list]]

    @app.post("/reset_metadata")
    def reset_metadata(self, reset_metadata_request: ResetMetadataRequest):
        doc_ids = reset_metadata_request.doc_ids
        new_meta = reset_metadata_request.new_meta
        try:
            docs = self._manager.get_docs(doc_ids)
            if not docs:
                return BaseResponse(code=400, msg="Failed, no doc found")
            self._manager.set_docs_new_meta({doc.doc_id: new_meta for doc in docs})
            return BaseResponse(data=None)
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)

    class QueryMetadataRequest(BaseModel):
        doc_id: str
        key: Optional[str] = None

    @app.post("/query_metadata")
    def query_metadata(self, query_metadata_request: QueryMetadataRequest):
        doc_id = query_metadata_request.doc_id
        key = query_metadata_request.key
        try:
            docs = self._manager.get_docs(doc_id)
            if not docs:
                return BaseResponse(data=None)
            doc = docs[0]
            meta_dict = json.loads(doc.meta) if doc.meta else {}
            if not key:
                return BaseResponse(data=meta_dict)
            if key not in meta_dict:
                return BaseResponse(code=400, msg=f"Failed, key {key} does not exist")
            return BaseResponse(data=meta_dict[key])
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)

    def __repr__(self):
        return lazyllm.make_repr("Module", "DocManager")

add_files_to_group(files, group_name, override=False, metadatas=None, user_path=None)

An endpoint to upload files and directly add them to a specified group.

Parameters:

  • files (List[UploadFile]) –

    List of files to upload.

  • group_name (str) –

    Name of the group to add the files to.

  • override (bool, default: False ) –

    Whether to overwrite existing files. Default is False.

  • metadatas (Optional[str], default: None ) –

    Metadata for the files in JSON format.

  • user_path (Optional[str], default: None ) –

    User-defined path for file uploads.

Returns:

  • BaseResponse: Operation result and file IDs.
Source code in lazyllm/tools/rag/doc_manager.py
    @app.post("/add_files_to_group")
    def add_files_to_group(self, files: List[UploadFile], group_name: str, override: bool = False,
                           metadatas: Optional[str] = None, user_path: Optional[str] = None):
        """
An endpoint to upload files and directly add them to a specified group.

Args:
    files (List[UploadFile]): List of files to upload.
    group_name (str): Name of the group to add the files to.
    override (bool): Whether to overwrite existing files. Default is False.
    metadatas (Optional[str]): Metadata for the files in JSON format.
    user_path (Optional[str]): User-defined path for file uploads.

**Returns:**

- BaseResponse: Operation result and file IDs.
"""
        try:
            response = self.upload_files(files, override=override, metadatas=metadatas, user_path=user_path)
            if response.code != 200: return response
            ids = response.data[0]
            self._manager.add_files_to_kb_group(ids, group_name)
            return BaseResponse(data=ids)
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)
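
A hypothetical client call, reusing the placeholder base address and the requests import from the sketch in the class description above; the group name and file are illustrative:

>>> with open('notes.txt', 'rb') as f:
...     requests.post(f'{base}/add_files_to_group', params={'group_name': 'kb1'},
...                   files=[('files', ('notes.txt', f))]).json()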

add_files_to_group_by_id(request)

An endpoint to add files to a specific group by file IDs.

Parameters:

  • request (FileGroupRequest) –

    Request containing file IDs and group name.

Returns:

  • BaseResponse: Operation result.
Source code in lazyllm/tools/rag/doc_manager.py
    @app.post("/add_files_to_group_by_id")
    def add_files_to_group_by_id(self, request: FileGroupRequest):
        """
An endpoint to add files to a specific group by file IDs.

Args:
    request (FileGroupRequest): Request containing file IDs and group name.

**Returns:**

- BaseResponse: Operation result.
"""
        try:
            self._manager.add_files_to_kb_group(request.file_ids, request.group_name)
            return BaseResponse()
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)
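
Since FileGroupRequest is a pydantic model, the request is sent as a JSON body. A hypothetical call with placeholder ids and group name:

>>> requests.post(f'{base}/add_files_to_group_by_id',
...               json={'file_ids': ['<doc-id>'], 'group_name': 'kb1'}).json()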

delete_files(request)

An endpoint to delete specified files.

Parameters:

  • request (FileGroupRequest) –

    Request containing file IDs and group name.

Returns:

  • BaseResponse: Deletion operation result.
Source code in lazyllm/tools/rag/doc_manager.py
    @app.post("/delete_files")
    def delete_files(self, request: FileGroupRequest):
        """
An endpoint to delete specified files.

Args:
    request (FileGroupRequest): Request containing file IDs and group name.

**Returns:**

- BaseResponse: Deletion operation result.
"""
        try:
            if request.group_name:
                return self.delete_files_from_group(request)
            else:
                documents = self._manager.delete_files(request.file_ids)
                deleted_ids = set([ele.doc_id for ele in documents])
                for doc in documents:
                    if os.path.exists(path := doc.path):
                        os.remove(path)
                results = ["Success" if ele.doc_id in deleted_ids else "Failed" for ele in documents]
                return BaseResponse(data=[request.file_ids, results])
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)
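
As the code above shows, supplying group_name only marks the files as deleting within that group (via delete_files_from_group), while omitting it deletes the records and removes the files from disk. A hypothetical call for each case:

>>> requests.post(f'{base}/delete_files', json={'file_ids': ['<doc-id>']}).json()
>>> requests.post(f'{base}/delete_files', json={'file_ids': ['<doc-id>'], 'group_name': 'kb1'}).json()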

document()

An endpoint to redirect to the default documentation page.

Returns:

  • RedirectResponse: Redirects to the /docs page.
Source code in lazyllm/tools/rag/doc_manager.py
    @app.get("/", response_model=BaseResponse, summary="docs")
    def document(self):
        """
An endpoint to redirect to the default documentation page.

**Returns:**

- RedirectResponse: Redirects to the `/docs` page.
"""
        return RedirectResponse(url="/docs")

list_files(limit=None, details=True, alive=None)

An endpoint to list uploaded files.

Parameters:

  • limit (Optional[int], default: None ) –

    Limit on the number of files returned. Default is None.

  • details (bool, default: True ) –

    Whether to return detailed information. Default is True.

  • alive (Optional[bool], default: None ) –

    If True, only returns non-deleted files. Default is None.

Returns:

  • BaseResponse: File list data.
Source code in lazyllm/tools/rag/doc_manager.py
    @app.get("/list_files")
    def list_files(self, limit: Optional[int] = None, details: bool = True, alive: Optional[bool] = None):
        """
An endpoint to list uploaded files.

Args:
    limit (Optional[int]): Limit on the number of files returned. Default is None.
    details (bool): Whether to return detailed information. Default is True.
    alive (Optional[bool]): If True, only returns non-deleted files. Default is None.

**Returns:**

- BaseResponse: File list data.
"""
        try:
            status = [DocListManager.Status.success, DocListManager.Status.waiting, DocListManager.Status.working,
                      DocListManager.Status.failed] if alive else DocListManager.Status.all
            return BaseResponse(data=self._manager.list_files(limit=limit, details=details, status=status))
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)
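
A hypothetical call using the placeholder base address; alive=true restricts the listing to files whose status is success, waiting, working or failed, i.e. it excludes deleted files:

>>> requests.get(f'{base}/list_files', params={'limit': 10, 'alive': 'true'}).json()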

list_files_in_group(group_name=None, limit=None, alive=None)

An endpoint to list files in a specific group.

Parameters:

  • group_name (Optional[str], default: None ) –

    The name of the file group.

  • limit (Optional[int], default: None ) –

    Limit on the number of files returned. Default is None.

  • alive (Optional[bool], default: None ) –

    Whether to return only non-deleted files.

Returns:

  • BaseResponse: List of files in the group.
Source code in lazyllm/tools/rag/doc_manager.py
    @app.get("/list_files_in_group")
    def list_files_in_group(self, group_name: Optional[str] = None,
                            limit: Optional[int] = None, alive: Optional[bool] = None):
        """
An endpoint to list files in a specific group.

Args:
    group_name (Optional[str]): The name of the file group.
    limit (Optional[int]): Limit on the number of files returned. Default is None.
    alive (Optional[bool]): Whether to return only non-deleted files.

**Returns:**

- BaseResponse: List of files in the group.
"""
        try:
            status = [DocListManager.Status.success, DocListManager.Status.waiting, DocListManager.Status.working,
                      DocListManager.Status.failed] if alive else DocListManager.Status.all
            return BaseResponse(data=self._manager.list_kb_group_files(group_name, limit, details=True, status=status))
        except Exception as e:
            import traceback
            return BaseResponse(code=500, msg=str(e) + '\ntraceback:\n' + str(traceback.format_exc()), data=None)
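
A hypothetical call using the placeholder base address:

>>> requests.get(f'{base}/list_files_in_group',
...               params={'group_name': 'kb1', 'limit': 10, 'alive': 'true'}).json()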

list_kb_groups()

An endpoint to list all document groups.

Returns:

  • BaseResponse: Contains the data of all document groups.
Source code in lazyllm/tools/rag/doc_manager.py
    @app.get("/list_kb_groups")
    def list_kb_groups(self):
        """
An endpoint to list all document groups.

**Returns:**

- BaseResponse: Contains the data of all document groups.
"""
        try:
            return BaseResponse(data=self._manager.list_all_kb_group())
        except Exception as e:
            return BaseResponse(code=500, msg=str(e), data=None)

upload_files(files, override=False, metadatas=None, user_path=None)

An endpoint to upload files and update their status. Multiple files can be uploaded at once.

Parameters:

  • files (List[UploadFile]) –

    List of files to upload.

  • override (bool, default: False ) –

    Whether to overwrite existing files. Default is False.

  • metadatas (Optional[str], default: None ) –

    Metadata for the files in JSON format.

  • user_path (Optional[str], default: None ) –

    User-defined path for file uploads.

Returns:

  • BaseResponse: Upload results and file IDs.
Source code in lazyllm/tools/rag/doc_manager.py
    @app.post("/upload_files")
    def upload_files(self, files: List[UploadFile], override: bool = False,  # noqa C901
                     metadatas: Optional[str] = None, user_path: Optional[str] = None):
        """
An endpoint to upload files and update their status. Multiple files can be uploaded at once.

Args:
    files (List[UploadFile]): List of files to upload.
    override (bool): Whether to overwrite existing files. Default is False.
    metadatas (Optional[str]): Metadata for the files in JSON format.
    user_path (Optional[str]): User-defined path for file uploads.

**Returns:**

- BaseResponse: Upload results and file IDs.
"""
        try:
            if user_path: user_path = user_path.lstrip('/')
            if metadatas:
                metadatas: Optional[List[Dict[str, str]]] = json.loads(metadatas)
                if len(files) != len(metadatas):
                    return BaseResponse(code=400, msg='Length of files and metadatas should be the same',
                                        data=None)
                for idx, mt in enumerate(metadatas):
                    err_msg = self._validate_metadata(mt)
                    if err_msg:
                        return BaseResponse(code=400, msg=f'file [{files[idx].filename}]: {err_msg}', data=None)
            file_paths = [os.path.join(self._manager._path, user_path or '', file.filename) for file in files]
            paths_is_new = [True] * len(file_paths)
            if override is True:
                is_success, msg, paths_is_new = self._manager.validate_paths(file_paths)
                if not is_success:
                    return BaseResponse(code=500, msg=msg, data=None)
            directorys = set(os.path.dirname(path) for path in file_paths)
            [os.makedirs(directory, exist_ok=True) for directory in directorys if directory]
            ids, results = [], []
            for i in range(len(files)):
                file_path = file_paths[i]
                content = files[i].file.read()
                metadata = metadatas[i] if metadatas else None
                if override is False:
                    file_path = self._gen_unique_filepath(file_path)
                with open(file_path, 'wb') as f: f.write(content)
                msg = "success"
                doc_id = gen_docid(file_path)
                if paths_is_new[i]:
                    docs = self._manager.add_files(
                        [file_path], metadatas=[metadata], status=DocListManager.Status.success)
                    if not docs:
                        msg = f"Failed: path {file_path} already exists in Database."
                else:
                    self._manager.update_kb_group(cond_file_ids=[doc_id], new_need_reparse=True)
                    msg = f"Success: path {file_path} will be reparsed."
                ids.append(doc_id)
                results.append(msg)
            return BaseResponse(data=[ids, results])
        except Exception as e:
            lazyllm.LOG.error(f'upload_files exception: {e}')
            return BaseResponse(code=500, msg=str(e), data=None)
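
The metadatas argument is a JSON-encoded list with one dict per uploaded file, and override controls whether an existing path is reparsed or a unique new path is generated. A hypothetical multipart call, again using the placeholder base address:

>>> import json
>>> meta = json.dumps([{'author': 'alice'}])  # one metadata dict per file
>>> with open('report.pdf', 'rb') as f:
...     requests.post(f'{base}/upload_files',
...                   params={'override': 'false', 'metadatas': meta},
...                   files=[('files', ('report.pdf', f))]).json()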

lazyllm.tools.SentenceSplitter

Bases: NodeTransform

Split sentences into chunks of a specified size. You can specify the size of the overlap between adjacent chunks.

Parameters:

  • chunk_size (int, default: 1024 ) –

    The size of the chunk after splitting.

  • chunk_overlap (int, default: 200 ) –

    The length of the overlapping content between two adjacent chunks.

Examples:

>>> import lazyllm
>>> from lazyllm.tools import Document, SentenceSplitter
>>> m = lazyllm.OnlineEmbeddingModule(source="glm")
>>> documents = Document(dataset_path='your_doc_path', embed=m, manager=False)
>>> documents.create_node_group(name="sentences", transform=SentenceSplitter, chunk_size=1024, chunk_overlap=100)
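
The splitter can also be applied directly to plain text via split_text, as in this minimal sketch (metadata_size=0 simply reserves no tokens for metadata):

>>> splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25)
>>> chunks = splitter.split_text('A long document ...', metadata_size=0)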
Source code in lazyllm/tools/rag/transform.py
class SentenceSplitter(NodeTransform):
    """
Split sentences into chunks of a specified size. You can specify the size of the overlap between adjacent chunks.

Args:
    chunk_size (int): The size of the chunk after splitting.
    chunk_overlap (int): The length of the overlapping content between two adjacent chunks.


Examples:

    >>> import lazyllm
    >>> from lazyllm.tools import Document, SentenceSplitter
    >>> m = lazyllm.OnlineEmbeddingModule(source="glm")
    >>> documents = Document(dataset_path='your_doc_path', embed=m, manager=False)
    >>> documents.create_node_group(name="sentences", transform=SentenceSplitter, chunk_size=1024, chunk_overlap=100)
    """
    def __init__(self, chunk_size: int = 1024, chunk_overlap: int = 200, num_workers: int = 0):
        super(__class__, self).__init__(num_workers=num_workers)
        if chunk_overlap > chunk_size:
            raise ValueError(
                f'Got a larger chunk overlap ({chunk_overlap}) than chunk size '
                f'({chunk_size}), should be smaller.'
            )

        assert (
            chunk_size > 0 and chunk_overlap >= 0
        ), 'chunk size should > 0 and chunk_overlap should >= 0'

        try:
            self._tiktoken_tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
        except requests.exceptions.ConnectionError:
            LOG.error(
                'Unable to download the vocabulary file for tiktoken `gpt-3.5-turbo`. '
                'Please check your internet connection. '
                'Alternatively, you can manually download the file '
                'and set the `TIKTOKEN_CACHE_DIR` environment variable.'
            )
            raise
        except Exception as e:
            LOG.error(f'Unable to build tiktoken tokenizer with error `{e}`')
            raise
        self._punkt_st_tokenizer = nltk.tokenize.PunktSentenceTokenizer()

        self._sentence_split_fns = [
            partial(split_text_keep_separator, separator='\n\n\n'),  # paragraph
            self._punkt_st_tokenizer.tokenize,
        ]

        self._sub_sentence_split_fns = [
            lambda t: re.findall(r'[^,.;。?!]+[,.;。?!]?', t),
            partial(split_text_keep_separator, separator=' '),
            list,  # split by character
        ]

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def transform(self, node: DocNode, **kwargs) -> List[str]:
        return self.split_text(
            node.get_text(),
            metadata_size=self._get_metadata_size(node),
        )

    def _get_metadata_size(self, node: DocNode) -> int:
        # Return the bigger size to ensure chunk_size < limit
        return max(
            self._token_size(node.get_metadata_str(mode=MetadataMode.EMBED)),
            self._token_size(node.get_metadata_str(mode=MetadataMode.LLM)),
        )

    def split_text(self, text: str, metadata_size: int) -> List[str]:
        if text == '':
            return ['']
        effective_chunk_size = self.chunk_size - metadata_size
        if effective_chunk_size <= 0:
            raise ValueError(
                f'Metadata length ({metadata_size}) is longer than chunk size '
                f'({self.chunk_size}). Consider increasing the chunk size or '
                'decreasing the size of your metadata to avoid this.'
            )
        elif effective_chunk_size < 50:
            LOG.warning(
                f'Metadata length ({metadata_size}) is close to chunk size '
                f'({self.chunk_size}). Resulting chunks are less than 50 tokens. '
                'Consider increasing the chunk size or decreasing the size of '
                'your metadata to avoid this.',
                flush=True,
            )

        splits = self._split(text, effective_chunk_size)
        chunks = self._merge(splits, effective_chunk_size)
        return chunks

    def _split(self, text: str, chunk_size: int) -> List[_Split]:
        """Break text into splits that are smaller than chunk size.

        The order of splitting is:
        1. split by paragraph separator
        2. split by chunking tokenizer
        3. split by second chunking regex
        4. split by default separator (' ')
        5. split by character
        """
        token_size = self._token_size(text)
        if token_size <= chunk_size:
            return [_Split(text, is_sentence=True, token_size=token_size)]

        text_splits_by_fns, is_sentence = self._get_splits_by_fns(text)

        text_splits = []
        for text in text_splits_by_fns:
            token_size = self._token_size(text)
            if token_size <= chunk_size:
                text_splits.append(
                    _Split(
                        text,
                        is_sentence=is_sentence,
                        token_size=token_size,
                    )
                )
            else:
                recursive_text_splits = self._split(text, chunk_size=chunk_size)
                text_splits.extend(recursive_text_splits)
        return text_splits

    def _merge(self, splits: List[_Split], chunk_size: int) -> List[str]:
        chunks: List[str] = []
        cur_chunk: List[Tuple[str, int]] = []  # list of (text, length)
        cur_chunk_len = 0
        is_chunk_new = True

        def close_chunk() -> None:
            nonlocal chunks, cur_chunk, cur_chunk_len, is_chunk_new

            chunks.append(''.join([text for text, _ in cur_chunk]))
            last_chunk = cur_chunk
            cur_chunk = []
            cur_chunk_len = 0
            is_chunk_new = True

            # Add overlap to the next chunk using the last one first
            overlap_len = 0
            for text, length in reversed(last_chunk):
                if overlap_len + length > self.chunk_overlap:
                    break
                cur_chunk.append((text, length))
                overlap_len += length
                cur_chunk_len += length
            cur_chunk.reverse()

        i = 0
        while i < len(splits):
            cur_split = splits[i]
            if cur_split.token_size > chunk_size:
                raise ValueError('Single token exceeded chunk size')
            if cur_chunk_len + cur_split.token_size > chunk_size and not is_chunk_new:
                # if adding split to current chunk exceeds chunk size
                close_chunk()
            else:
                if (
                    cur_split.is_sentence
                    or cur_chunk_len + cur_split.token_size <= chunk_size
                    or is_chunk_new  # new chunk, always add at least one split
                ):
                    # add split to chunk
                    cur_chunk_len += cur_split.token_size
                    cur_chunk.append((cur_split.text, cur_split.token_size))
                    i += 1
                    is_chunk_new = False
                else:
                    close_chunk()

        # handle the last chunk
        if not is_chunk_new:
            chunks.append(''.join([text for text, _ in cur_chunk]))

        # Remove whitespace only chunks and remove leading and trailing whitespace.
        return [stripped_chunk for chunk in chunks if (stripped_chunk := chunk.strip())]

    def _token_size(self, text: str) -> int:
        return len(self._tiktoken_tokenizer.encode(text, allowed_special='all'))

    def _get_splits_by_fns(self, text: str) -> Tuple[List[str], bool]:
        for split_fn in self._sentence_split_fns:
            splits = split_fn(text)
            if len(splits) > 1:
                return splits, True

        for split_fn in self._sub_sentence_split_fns:
            splits = split_fn(text)
            if len(splits) > 1:
                break

        return splits, False

_split(text, chunk_size)

Break text into splits that are smaller than chunk size.

The order of splitting is:

1. split by paragraph separator
2. split by chunking tokenizer
3. split by second chunking regex
4. split by default separator (' ')
5. split by character

Source code in lazyllm/tools/rag/transform.py
def _split(self, text: str, chunk_size: int) -> List[_Split]:
    """Break text into splits that are smaller than chunk size.

    The order of splitting is:
    1. split by paragraph separator
    2. split by chunking tokenizer
    3. split by second chunking regex
    4. split by default separator (' ')
    5. split by character
    """
    token_size = self._token_size(text)
    if token_size <= chunk_size:
        return [_Split(text, is_sentence=True, token_size=token_size)]

    text_splits_by_fns, is_sentence = self._get_splits_by_fns(text)

    text_splits = []
    for text in text_splits_by_fns:
        token_size = self._token_size(text)
        if token_size <= chunk_size:
            text_splits.append(
                _Split(
                    text,
                    is_sentence=is_sentence,
                    token_size=token_size,
                )
            )
        else:
            recursive_text_splits = self._split(text, chunk_size=chunk_size)
            text_splits.extend(recursive_text_splits)
    return text_splits

lazyllm.tools.LLMParser

Bases: NodeTransform

A text summarizer and keyword extractor that is responsible for analyzing the text input by the user and providing concise summaries or extracting relevant keywords based on the requested task.

Parameters:

  • llm (TrainableModule) –

    A trainable module.

  • language (str) –

    The language type, currently only supports Chinese (zh) and English (en).

  • task_type (str) –

    The task type. The currently supported values are summary, keywords, and qa.

Examples:

>>> from lazyllm import TrainableModule
>>> from lazyllm.tools.rag import LLMParser
>>> llm = TrainableModule("internlm2-chat-7b")
>>> summary_parser = LLMParser(llm, language="en", task_type="summary")
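
A further hedged sketch: the parser can be registered as a node-group transform so that summaries are generated on top of an existing chunk group (the parent argument of create_node_group is assumed here):

>>> from lazyllm.tools import Document
>>> documents = Document(dataset_path='your_doc_path')
>>> documents.create_node_group(name='summary', transform=summary_parser, parent='CoarseChunk')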
Source code in lazyllm/tools/rag/transform.py
class LLMParser(NodeTransform):
    """
A text summarizer and keyword extractor that is responsible for analyzing the text input by the user and providing concise summaries or extracting relevant keywords based on the requested task.

Args:
    llm (TrainableModule): A trainable module.
    language (str): The language type, currently only supports Chinese (zh) and English (en).
    task_type (str): Currently supports two types of tasks: summary and keyword extraction.


Examples:

    >>> from lazyllm import TrainableModule
    >>> from lazyllm.tools.rag import LLMParser
    >>> llm = TrainableModule("internlm2-chat-7b")
    >>> summary_parser = LLMParser(llm, language="en", task_type="summary")
    """
    def __init__(self, llm: TrainableModule, language: str, task_type: str, num_workers: int = 0):
        super(__class__, self).__init__(num_workers=num_workers)
        assert language in ['en', 'zh'], f'Not supported language {language}'
        assert task_type in ['summary', 'keywords', 'qa'], f'Not supported task_type {task_type}'
        self._task_type = task_type
        self._llm = llm.share(prompt=AlpacaPrompter(dict(
            system=templates[language][task_type], user='#input:\n{input}\n#output:\n'))).formatter(self._format)
        self._task_type = task_type

    def transform(self, node: DocNode, **kwargs) -> List[str]:
        """
Perform the set task on the specified document.

Args:
    node (DocNode): The document on which the extraction task needs to be performed.


Examples:

    >>> import lazyllm
    >>> from lazyllm.tools import LLMParser
    >>> llm = lazyllm.TrainableModule("internlm2-chat-7b").start()
    >>> m = lazyllm.TrainableModule("bge-large-zh-v1.5").start()
    >>> summary_parser = LLMParser(llm, language="en", task_type="summary")
    >>> keywords_parser = LLMParser(llm, language="en", task_type="keywords")
    >>> documents = lazyllm.Document(dataset_path="/path/to/your/data", embed=m, manager=False)
    >>> rm = lazyllm.Retriever(documents, group_name='CoarseChunk', similarity='bm25', topk=6)
    >>> doc_nodes = rm("test")
    >>> summary_result = summary_parser.transform(doc_nodes[0])
    >>> keywords_result = keywords_parser.transform(doc_nodes[0])
    """
        result = self._llm(node.get_text())
        return [result] if isinstance(result, str) else result

    def _format(self, input):
        if self._task_type == 'keywords':
            return [s.strip() for s in input.split(',')]
        elif self._task_type == 'qa':
            return [QADocNode(query=q.strip()[3:].strip(), answer=a.strip()[3:].strip()) for q, a in zip(
                list(filter(None, map(str.strip, input.split("\n"))))[::2],
                list(filter(None, map(str.strip, input.split("\n"))))[1::2])]
        return input

transform(node, **kwargs)

Perform the set task on the specified document.

Parameters:

  • node (DocNode) –

    The document on which the extraction task needs to be performed.

Examples:

>>> import lazyllm
>>> from lazyllm.tools import LLMParser
>>> llm = lazyllm.TrainableModule("internlm2-chat-7b").start()
>>> m = lazyllm.TrainableModule("bge-large-zh-v1.5").start()
>>> summary_parser = LLMParser(llm, language="en", task_type="summary")
>>> keywords_parser = LLMParser(llm, language="en", task_type="keywords")
>>> documents = lazyllm.Document(dataset_path="/path/to/your/data", embed=m, manager=False)
>>> rm = lazyllm.Retriever(documents, group_name='CoarseChunk', similarity='bm25', topk=6)
>>> doc_nodes = rm("test")
>>> summary_result = summary_parser.transform(doc_nodes[0])
>>> keywords_result = keywords_parser.transform(doc_nodes[0])
Source code in lazyllm/tools/rag/transform.py
    def transform(self, node: DocNode, **kwargs) -> List[str]:
        """
Perform the set task on the specified document.

Args:
    node (DocNode): The document on which the extraction task needs to be performed.


Examples:

    >>> import lazyllm
    >>> from lazyllm.tools import LLMParser
    >>> llm = lazyllm.TrainableModule("internlm2-chat-7b").start()
    >>> m = lazyllm.TrainableModule("bge-large-zh-v1.5").start()
    >>> summary_parser = LLMParser(llm, language="en", task_type="summary")
    >>> keywords_parser = LLMParser(llm, language="en", task_type="keywords")
    >>> documents = lazyllm.Document(dataset_path="/path/to/your/data", embed=m, manager=False)
    >>> rm = lazyllm.Retriever(documents, group_name='CoarseChunk', similarity='bm25', topk=6)
    >>> doc_nodes = rm("test")
    >>> summary_result = summary_parser.transform(doc_nodes[0])
    >>> keywords_result = keywords_parser.transform(doc_nodes[0])
    """
        result = self._llm(node.get_text())
        return [result] if isinstance(result, str) else result

lazyllm.tools.WebModule

Bases: ModuleBase

WebModule is a web-based interactive interface provided by LazyLLM for developers. After initializing and starting a WebModule, developers can see the structure of the module they provide behind the WebModule and transmit the input of the Chatbot component to their modules. The results and logs returned by the module are displayed in the “Processing Logs” and Chatbot components on the web page. In addition, Checkbox or Text components can be added programmatically to the web page to pass additional parameters to the background module. The WebModule page also provides “Use Context”, “Stream Output”, and “Append Output” checkboxes, which can be used to adjust the interaction between the page and the module behind it.

WebModule.init_web(component_descs) -> gradio.Blocks

Generates a demonstration web page based on gradio. The function initializes session-related data to save chat history and logs for different pages, dynamically adds Checkbox and Text components to the page according to the component_descs parameter, and finally binds the corresponding functions to the buttons and text boxes on the page. WebModule’s __init__ function calls this method to generate the page.

Parameters:

  • component_descs (list) –

    A list used to add components to the page. Each element in the list is also a list containing five elements: the module ID, the module name, the component name, the component type (currently only Checkbox and Text are supported), and the default value of the component.

Examples:

>>> import lazyllm
>>> def func2(in_str, do_sample=True, temperature=0.0, *args, **kwargs):
...     return f"func2:{in_str}|do_sample:{str(do_sample)}|temp:{temperature}"
...
>>> m1=lazyllm.ActionModule(func2)
>>> m1.name="Module1"
>>> w = lazyllm.WebModule(m1, port=[20570, 20571, 20572], components={
...         m1:[('do_sample', 'Checkbox', True), ('temperature', 'Text', 0.1)]},
...                       text_mode=lazyllm.tools.WebModule.Mode.Refresh)
>>> w.start()
193703: 2024-06-07 10:26:00 lazyllm SUCCESS: ...
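
A further illustrative sketch with streaming enabled and a local assets directory exposed to gradio (the port and path below are placeholders):

>>> w2 = lazyllm.WebModule(m1, port=20580, stream=True, static_paths='/path/to/assets')
>>> w2.start()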
Source code in lazyllm/tools/webpages/webmodule.py
class WebModule(ModuleBase):
    """WebModule is a web-based interactive interface provided by LazyLLM for developers. After initializing and starting
a WebModule, developers can see structure of the module they provides behind the WebModule, and transmit the input
of the Chatbot component to their modules. The results and logs returned by the module will be displayed on the
“Processing Logs” and Chatbot component on the web page. In addition, Checkbox or Text components can be added
programmatically to the web page for additional parameters to the background module. Meanwhile, The WebModule page
provides Checkboxes of “Use Context,” “Stream Output,” and “Append Output,” which can be used to adjust the
interaction between the page and the module behind.

<span style="font-size: 20px;">&ensp;**`WebModule.init_web(component_descs) -> gradio.Blocks`**</span>

Generate a demonstration web page based on gradio. The function initializes session-related data to save chat history
and logs for different pages, then dynamically add Checkbox and Text components to the page according to component_descs
parameter, and set the corresponding functions for the buttons and text boxes on the page at last.
WebModule’s __init__ function calls this method to generate the page.

Args:
    component_descs (list): A list used to add components to the page. Each element in the list is also a list containing
    5 elements, which are the module ID, the module name, the component name, the component type (currently only
    supports Checkbox and Text), and the default value of the component.



Examples:
    >>> import lazyllm
    >>> def func2(in_str, do_sample=True, temperature=0.0, *args, **kwargs):
    ...     return f"func2:{in_str}|do_sample:{str(do_sample)}|temp:{temperature}"
    ...
    >>> m1=lazyllm.ActionModule(func2)
    >>> m1.name="Module1"
    >>> w = lazyllm.WebModule(m1, port=[20570, 20571, 20572], components={
    ...         m1:[('do_sample', 'Checkbox', True), ('temperature', 'Text', 0.1)]},
    ...                       text_mode=lazyllm.tools.WebModule.Mode.Refresh)
    >>> w.start()
    193703: 2024-06-07 10:26:00 lazyllm SUCCESS: ...
    """
    class Mode:
        Dynamic = 0
        Refresh = 1
        Appendix = 2

    def __init__(self, m, *, components=dict(), title='对话演示终端', port=None,
                 history=[], text_mode=None, trace_mode=None, audio=False, stream=False,
                 files_target=None, static_paths: Union[str, Path, List[str | Path]] = None) -> None:
        super().__init__()
        # Set the static directory of gradio so that gradio can access local resources in the directory
        if isinstance(static_paths, (str, Path)):
            self._static_paths = [static_paths]
        elif isinstance(static_paths, list) and all(isinstance(p, (str, Path)) for p in static_paths):
            self._static_paths = static_paths
        elif static_paths is None:
            self._static_paths = []
        else:
            raise ValueError(f"static_paths only supported str, path or list types. Not supported {static_paths}")
        self.m = lazyllm.ActionModule(m) if isinstance(m, lazyllm.FlowBase) else m
        self.pool = lazyllm.ThreadPoolExecutor(max_workers=50)
        self.title = title
        self.port = port or range(20500, 20799)
        components = sum([[([k._module_id, k._module_name] + list(v)) for v in vs]
                         for k, vs in components.items()], [])
        self.ckeys = [[c[0], c[2]] for c in components]
        if isinstance(m, (OnlineChatModule, TrainableModule)) and not history:
            history = [m]
        self.history = [h._module_id for h in history]
        if trace_mode:
            LOG.warn('trace_mode is deprecated')
        self.text_mode = text_mode if text_mode else WebModule.Mode.Dynamic
        self.cach_path = self._set_up_caching()
        self.audio = audio
        self.stream = stream
        self.files_target = files_target if isinstance(files_target, list) or files_target is None else [files_target]
        self.demo = self.init_web(components)
        self.url = None
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)

    def _get_all_file_submodule(self):
        if self.files_target: return
        self.files_target = []
        self.for_each(
            lambda x: getattr(x, 'template_message', None),
            lambda x: self.files_target.append(x)
        )

    def _signal_handler(self, signum, frame):
        LOG.info(f"Signal {signum} received, terminating subprocess.")
        atexit._run_exitfuncs()
        sys.exit(0)

    def _set_up_caching(self):
        if 'GRADIO_TEMP_DIR' in os.environ:
            cach_path = os.environ['GRADIO_TEMP_DIR']
        else:
            cach_path = os.path.join(lazyllm.config['temp_dir'], 'gradio_cach')
            os.environ['GRADIO_TEMP_DIR'] = cach_path
        if not os.path.exists(cach_path):
            os.makedirs(cach_path)
        return cach_path

    def init_web(self, component_descs):
        gr.set_static_paths(self._static_paths)
        with gr.Blocks(css=css, title=self.title, analytics_enabled=False) as demo:
            sess_data = gr.State(value={
                'sess_titles': [''],
                'sess_logs': {},
                'sess_history': {},
                'sess_num': 1,
                'curr_sess': '',
                'frozen_query': '',
            })
            with gr.Row():
                with gr.Column(scale=3):
                    with gr.Row():
                        with lazyllm.config.temp('repr_show_child', True):
                            gr.Textbox(elem_id='module', interactive=False, show_label=True,
                                       label="模型结构", value=repr(self.m))
                    with gr.Row():
                        chat_use_context = gr.Checkbox(interactive=True, value=False, label="使用上下文")
                    with gr.Row():
                        stream_output = gr.Checkbox(interactive=self.stream, value=self.stream, label="流式输出")
                        text_mode = gr.Checkbox(interactive=(self.text_mode == WebModule.Mode.Dynamic),
                                                value=(self.text_mode != WebModule.Mode.Refresh), label="追加输出")
                    components = []
                    for _, gname, name, ctype, value in component_descs:
                        if ctype in ('Checkbox', 'Text'):
                            components.append(getattr(gr, ctype)(interactive=True, value=value, label=f'{gname}.{name}'))
                        elif ctype == 'Dropdown':
                            components.append(getattr(gr, ctype)(interactive=True, choices=value,
                                                                 label=f'{gname}.{name}'))
                        else:
                            raise KeyError(f'invalid component type: {ctype}')
                    with gr.Row():
                        dbg_msg = gr.Textbox(show_label=True, label='处理日志',
                                             elem_id='logging', interactive=False, max_lines=10)
                    clear_btn = gr.Button(value="🗑️  Clear history", interactive=True)
                with gr.Column(scale=6):
                    with gr.Row():
                        add_sess_btn = gr.Button("添加新会话")
                        sess_drpdn = gr.Dropdown(choices=sess_data.value['sess_titles'], label="选择会话:", value='')
                        del_sess_btn = gr.Button("删除当前会话")
                    chatbot = gr.Chatbot(height=700)
                    query_box = gr.MultimodalTextbox(show_label=False, placeholder='输入内容并回车!!!', interactive=True)
                    recordor = gr.Audio(sources=["microphone"], type="filepath", visible=self.audio)

            query_box.submit(self._init_session, [query_box, sess_data, recordor],
                                                 [sess_drpdn, chatbot, dbg_msg, sess_data, recordor], queue=True
                ).then(lambda: gr.update(interactive=False), None, query_box, queue=False
                ).then(lambda: gr.update(interactive=False), None, add_sess_btn, queue=False
                ).then(lambda: gr.update(interactive=False), None, sess_drpdn, queue=False
                ).then(lambda: gr.update(interactive=False), None, del_sess_btn, queue=False
                ).then(self._prepare, [query_box, chatbot, sess_data], [query_box, chatbot], queue=True
                ).then(self._respond_stream, [chat_use_context, chatbot, stream_output, text_mode] + components,
                                             [chatbot, dbg_msg], queue=chatbot
                ).then(lambda: gr.update(interactive=True), None, query_box, queue=False
                ).then(lambda: gr.update(interactive=True), None, add_sess_btn, queue=False
                ).then(lambda: gr.update(interactive=True), None, sess_drpdn, queue=False
                ).then(lambda: gr.update(interactive=True), None, del_sess_btn, queue=False)
            clear_btn.click(self._clear_history, [sess_data], outputs=[chatbot, query_box, dbg_msg, sess_data])

            sess_drpdn.change(self._change_session, [sess_drpdn, chatbot, dbg_msg, sess_data],
                                                    [sess_drpdn, chatbot, query_box, dbg_msg, sess_data])
            add_sess_btn.click(self._add_session, [chatbot, dbg_msg, sess_data],
                                                  [sess_drpdn, chatbot, query_box, dbg_msg, sess_data])
            del_sess_btn.click(self._delete_session, [sess_drpdn, sess_data],
                                                     [sess_drpdn, chatbot, query_box, dbg_msg, sess_data])
            recordor.change(self._sub_audio, recordor, query_box)
            return demo

    def _sub_audio(self, audio):
        if audio:
            return {'text': '', 'files': [audio]}
        else:
            return {}

    def _init_session(self, query, session, audio):
        audio = None
        session['frozen_query'] = query
        if session['curr_sess'] != '':  # remain unchanged.
            return gr.Dropdown(), gr.Chatbot(), gr.Textbox(), session, audio

        if "text" in query and query["text"] is not None:
            id_name = query['text']
        else:
            id_name = id(query)  # fall back to a unique id when the query carries no text
        session['curr_sess'] = f"({session['sess_num']})  {id_name}"
        session['sess_num'] += 1
        session['sess_titles'][0] = session['curr_sess']

        session['sess_logs'][session['curr_sess']] = []
        session['sess_history'][session['curr_sess']] = []
        return gr.update(choices=session['sess_titles'], value=session['curr_sess']), [], '', session, audio

    def _add_session(self, chat_history, log_history, session):
        if session['curr_sess'] == '':
            LOG.warning('Cannot create new session while current session is empty.')
            return gr.Dropdown(), gr.Chatbot(), {}, gr.Textbox(), session

        self._save_history(chat_history, log_history, session)

        session['curr_sess'] = ''
        session['sess_titles'].insert(0, session['curr_sess'])
        return gr.update(choices=session['sess_titles'], value=session['curr_sess']), [], {}, '', session

    def _save_history(self, chat_history, log_history, session):
        if session['curr_sess'] in session['sess_titles']:
            session['sess_history'][session['curr_sess']] = chat_history
            session['sess_logs'][session['curr_sess']] = log_history

    def _change_session(self, session_title, chat_history, log_history, session):
        if session['curr_sess'] == '':  # new session
            return gr.Dropdown(), [], {}, '', session

        if session_title not in session['sess_titles']:
            LOG.warning(f'{session_title} is not an existing session title.')
            return gr.Dropdown(), gr.Chatbot(), {}, gr.Textbox(), session

        self._save_history(chat_history, log_history, session)

        session['curr_sess'] = session_title
        return (gr.update(choices=session['sess_titles'], value=session['curr_sess']),
                session['sess_history'][session['curr_sess']], {},
                session['sess_logs'][session['curr_sess']], session)

    def _delete_session(self, session_title, session):
        if session_title not in session['sess_titles']:
            LOG.warning(f'session {session_title} does not exist.')
            return gr.Dropdown(), session
        session['sess_titles'].remove(session_title)

        if session_title != '':
            del session['sess_history'][session_title]
            del session['sess_logs'][session_title]
            session['curr_sess'] = session_title
        else:
            session['curr_sess'] = 'dummy session'
            # add_session and change_session cannot accept an uninitialized session.
            # Here we need to imitate removal of a real session so that
            # add_session and change_session could skip saving chat history.

        if len(session['sess_titles']) == 0:
            return self._add_session(None, None, session)
        else:
            return self._change_session(session['sess_titles'][0], None, {}, session)

    def _prepare(self, query, chat_history, session):
        if not query.get('text', '') and not query.get('files', []):
            query = session['frozen_query']
        if chat_history is None:
            chat_history = []
        for x in query["files"]:
            chat_history.append([[x,], None])
        if "text" in query and query["text"]:
            chat_history.append([query['text'], None])
        return {}, chat_history

    def _respond_stream(self, use_context, chat_history, stream_output, append_text, *args):  # noqa C901
        try:
            # TODO: move context to trainable module
            files = []
            chat_history[-1][1], log_history = '', []
            for file in chat_history[::-1]:
                if file[-1]: break  # not current chat
                if isinstance(file[0], (tuple, list)):
                    files.append(file[0][0])
                elif isinstance(file[0], str) and file[0].startswith('lazyllm_img::'):  # Just for pytest
                    files.append(file[0][13:])
            if isinstance(chat_history[-1][0], str):
                string = chat_history[-1][0]
            else:
                string = ''
            if self.files_target is None:
                self._get_all_file_submodule()
            if files and self.files_target:
                for module in self.files_target:
                    assert isinstance(module, ModuleBase)
                    if module._module_id in globals['lazyllm_files']:
                        globals['lazyllm_files'][module._module_id].extend(files)
                    else:
                        globals['lazyllm_files'][module._module_id] = files
                string += f' ## Get attachments: {os.path.basename(files[-1])}'
            input = string
            history = chat_history[:-1] if use_context and len(chat_history) > 1 else list()

            for k, v in zip(self.ckeys, args):
                if k[0] not in globals['global_parameters']: globals['global_parameters'][k[0]] = dict()
                globals['global_parameters'][k[0]][k[1]] = v

            if use_context:
                for h in self.history:
                    if h not in globals['chat_history']: globals['chat_history'][h] = list()
                    globals['chat_history'][h] = history

            if FileSystemQueue().size() > 0: FileSystemQueue().clear()
            kw = dict(stream_output=stream_output) if isinstance(self.m, (TrainableModule, OnlineChatModule)) else {}
            func_future = self.pool.submit(self.m, input, **kw)
            while True:
                if value := FileSystemQueue().dequeue():
                    chat_history[-1][1] = chat_history[-1][1] + ''.join(value) if append_text else ''.join(value)
                    if stream_output: yield chat_history, ''
                elif value := FileSystemQueue.get_instance('lazy_error').dequeue():
                    log_history.append(''.join(value))
                elif value := FileSystemQueue.get_instance('lazy_trace').dequeue():
                    log_history.append(''.join(value))
                elif func_future.done(): break
                time.sleep(0.01)
            result = func_future.result()
            if FileSystemQueue().size() > 0: FileSystemQueue().clear()

            def get_log_and_message(s):
                if isinstance(s, dict):
                    s = s.get("message", {}).get("content", "")
                else:
                    try:
                        r = decode_query_with_filepaths(s)
                        if isinstance(r, str):
                            r = json.loads(r)
                        if 'choices' in r:
                            if "type" not in r["choices"][0] or (
                                    "type" in r["choices"][0] and r["choices"][0]["type"] != "tool_calls"):
                                delta = r["choices"][0]["delta"]
                                if "content" in delta:
                                    s = delta["content"]
                                else:
                                    s = ""
                        elif isinstance(r, dict) and 'files' in r and 'query' in r:
                            return r['query'], ''.join(log_history), r['files'] if len(r['files']) > 0 else None
                        else:
                            s = s
                    except (ValueError, KeyError, TypeError):
                        s = s
                    except Exception as e:
                        LOG.error(f"Uncaptured error `{e}` when parsing `{s}`, please contact us if you see this.")
                return s, "".join(log_history), None

            def contains_markdown_image(text: str):
                pattern = r"!\[.*?\]\((.*?)\)"
                return bool(re.search(pattern, text))

            def extract_img_path(text: str):
                pattern = r"!\[.*?\]\((.*?)\)"
                urls = re.findall(pattern, text)
                return urls

            file_paths = None
            if isinstance(result, (str, dict)):
                result, log, file_paths = get_log_and_message(result)
            if file_paths:
                for i, file_path in enumerate(file_paths):
                    suffix = os.path.splitext(file_path)[-1].lower()
                    file = None
                    if suffix in PIL.Image.registered_extensions().keys():
                        file = gr.Image(file_path)
                    elif suffix in ('.mp3', '.wav'):
                        file = gr.Audio(file_path)
                    else:
                        LOG.error(f'Not supported type: {suffix}, for file: {file_path}')
                    if i == 0:
                        chat_history[-1][1] = file
                    else:
                        chat_history.append([None, file])
                if result:
                    chat_history.append([None, result])
            else:
                assert isinstance(result, str), f'Result should only be str, but got {type(result)}'
                if not contains_markdown_image(result):
                    count = (len(match.group(1)) if (match := re.search(r'(\n+)$', result)) else 0) + len(result) + 1
                    if result and not (result in chat_history[-1][1][-count:]):
                        chat_history[-1][1] += "\n\n" + result
                else:
                    urls = extract_img_path(result)
                    for url in urls:
                        suffix = os.path.splitext(url)[-1].lower()
                        if suffix in PIL.Image.registered_extensions().keys() and os.path.exists(url):
                            result = result.replace(url, "file=" + url)
                    chat_history[-1][1] += result
        except requests.RequestException as e:
            chat_history = None
            log = str(e)
        except Exception as e:
            chat_history = None
            log = f'{str(e)}\n--- traceback ---\n{traceback.format_exc()}'
            LOG.error(log)
        globals['chat_history'].clear()
        yield chat_history, log

    def _clear_history(self, session):
        session['sess_history'][session['curr_sess']] = []
        session['sess_logs'][session['curr_sess']] = []
        return [], {}, '', session

    def _work(self):
        if isinstance(self.port, (range, tuple, list)):
            port = self._find_can_use_network_port()
        else:
            port = self.port
            assert self._verify_port_access(port), f'port {port} is occupied'

        self.url = f'http://127.0.0.1:{port}'
        self.broadcast_url = f'http://0.0.0.0:{port}'

        self.demo.queue().launch(server_name="0.0.0.0", server_port=port, prevent_thread_lock=True)
        LOG.success('LazyLLM webmodule launched successfully: Running on: '
                    f'{self.broadcast_url}, local URL: {self.url}', flush=True)

    def _update(self, *, mode=None, recursive=True):
        super(__class__, self)._update(mode=mode, recursive=recursive)
        self._work()
        return self

    def wait(self):
        self.demo.block_thread()

    def stop(self):
        if self.demo:
            self.demo.close()
            del self.demo
            self.demo, self.url = None, ''

    @property
    def status(self):
        return 'running' if self.url else 'waiting' if self.url is None else 'Cancelled'

    def __repr__(self):
        return lazyllm.make_repr('Module', 'Web', name=self._module_name, subs=[repr(self.m)])

    def _find_can_use_network_port(self):
        for port in self.port:
            if self._verify_port_access(port):
                return port
        raise RuntimeError(
            f'The ports in the range {self.port} are all occupied. '
            'Please change the port range or release the relevant ports.'
        )

    def _verify_port_access(self, port):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            result = s.connect_ex(('127.0.0.1', port))
            return result != 0

lazyllm.tools.ToolManager

Bases: ModuleBase

ToolManager is a tool management class used to provide tool information and tool invocation to function calls.

When constructing this management class, you pass in a list of tool name strings. A tool can be provided by LazyLLM or defined by the user; a user-defined tool must first be registered in LazyLLM before it can be used. Registration is done with the fc_register registrar, which already provides the tool group, so all functions can be registered into that group uniformly. Each registered function must annotate its parameters and carry a docstring that describes the function as well as the type and meaning of every parameter, so that the tool manager can parse the function and pass it to the LLM.

Parameters:

  • tools (List[str]) –

    A list of tool name strings.

Examples:

>>> from lazyllm.tools import ToolManager, fc_register
>>> import json
>>> from typing import Literal
>>> @fc_register("tool")
>>> def get_current_weather(location: str, unit: Literal["fahrenheit", "celsius"]="fahrenheit"):
...     '''
...     Get the current weather in a given location
...
...     Args:
...         location (str): The city and state, e.g. San Francisco, CA.
...         unit (str): The temperature unit to use. Infer this from the users location.
...     '''
...     if 'tokyo' in location.lower():
...         return json.dumps({'location': 'Tokyo', 'temperature': '10', 'unit': 'celsius'})
...     elif 'san francisco' in location.lower():
...         return json.dumps({'location': 'San Francisco', 'temperature': '72', 'unit': 'fahrenheit'})
...     elif 'paris' in location.lower():
...         return json.dumps({'location': 'Paris', 'temperature': '22', 'unit': 'celsius'})
...     elif 'beijing' in location.lower():
...         return json.dumps({'location': 'Beijing', 'temperature': '90', 'unit': 'fahrenheit'})
...     else:
...         return json.dumps({'location': location, 'temperature': 'unknown'})
...
>>> @fc_register("tool")
>>> def get_n_day_weather_forecast(location: str, num_days: int, unit: Literal["celsius", "fahrenheit"]='fahrenheit'):
...     '''
...     Get an N-day weather forecast
...
...     Args:
...         location (str): The city and state, e.g. San Francisco, CA.
...         num_days (int): The number of days to forecast.
...         unit (Literal['celsius', 'fahrenheit']): The temperature unit to use. Infer this from the users location.
...     '''
...     if 'tokyo' in location.lower():
...         return json.dumps({'location': 'Tokyo', 'temperature': '10', 'unit': 'celsius', "num_days": num_days})
...     elif 'san francisco' in location.lower():
...         return json.dumps({'location': 'San Francisco', 'temperature': '75', 'unit': 'fahrenheit', "num_days": num_days})
...     elif 'paris' in location.lower():
...         return json.dumps({'location': 'Paris', 'temperature': '25', 'unit': 'celsius', "num_days": num_days})
...     elif 'beijing' in location.lower():
...         return json.dumps({'location': 'Beijing', 'temperature': '85', 'unit': 'fahrenheit', "num_days": num_days})
...     else:
...         return json.dumps({'location': location, 'temperature': 'unknown'})
...
>>> tools = ["get_current_weather", "get_n_day_weather_forecast"]
>>> tm = ToolManager(tools)
>>> print(tm([{'name': 'get_n_day_weather_forecast', 'arguments': {'location': 'Beijing', 'num_days': 3}}])[0])
'{"location": "Beijing", "temperature": "85", "unit": "fahrenheit", "num_days": 3}'
Source code in lazyllm/tools/agent/toolsManager.py
class ToolManager(ModuleBase):
    """ToolManager is a tool management class used to provide tool information and tool calls to function call.

When constructing this management class, you need to pass in a list of tool name strings. The tool name here can be provided by LazyLLM or user-defined. If it is user-defined, it must first be registered in LazyLLM before it can be used. When registering, directly use the `fc_register` registrar, which has established the `tool` group, so when using the tool management class, all functions can be uniformly registered in the `tool` group. The function to be registered needs to annotate the function parameters, and add a functional description to the function, as well as the parameter type and function description. This is to facilitate the tool management class to parse the function and pass it to LLM for use.

Args:
    tools (List[str]): A list of tool name strings.


Examples:
    >>> from lazyllm.tools import ToolManager, fc_register
    >>> import json
    >>> from typing import Literal
    >>> @fc_register("tool")
    >>> def get_current_weather(location: str, unit: Literal["fahrenheit", "celsius"]="fahrenheit"):
    ...     '''
    ...     Get the current weather in a given location
    ...
    ...     Args:
    ...         location (str): The city and state, e.g. San Francisco, CA.
    ...         unit (str): The temperature unit to use. Infer this from the users location.
    ...     '''
    ...     if 'tokyo' in location.lower():
    ...         return json.dumps({'location': 'Tokyo', 'temperature': '10', 'unit': 'celsius'})
    ...     elif 'san francisco' in location.lower():
    ...         return json.dumps({'location': 'San Francisco', 'temperature': '72', 'unit': 'fahrenheit'})
    ...     elif 'paris' in location.lower():
    ...         return json.dumps({'location': 'Paris', 'temperature': '22', 'unit': 'celsius'})
    ...     elif 'beijing' in location.lower():
    ...         return json.dumps({'location': 'Beijing', 'temperature': '90', 'unit': 'fahrenheit'})
    ...     else:
    ...         return json.dumps({'location': location, 'temperature': 'unknown'})
    ...
    >>> @fc_register("tool")
    >>> def get_n_day_weather_forecast(location: str, num_days: int, unit: Literal["celsius", "fahrenheit"]='fahrenheit'):
    ...     '''
    ...     Get an N-day weather forecast
    ...
    ...     Args:
    ...         location (str): The city and state, e.g. San Francisco, CA.
    ...         num_days (int): The number of days to forecast.
    ...         unit (Literal['celsius', 'fahrenheit']): The temperature unit to use. Infer this from the users location.
    ...     '''
    ...     if 'tokyo' in location.lower():
    ...         return json.dumps({'location': 'Tokyo', 'temperature': '10', 'unit': 'celsius', "num_days": num_days})
    ...     elif 'san francisco' in location.lower():
    ...         return json.dumps({'location': 'San Francisco', 'temperature': '75', 'unit': 'fahrenheit', "num_days": num_days})
    ...     elif 'paris' in location.lower():
    ...         return json.dumps({'location': 'Paris', 'temperature': '25', 'unit': 'celsius', "num_days": num_days})
    ...     elif 'beijing' in location.lower():
    ...         return json.dumps({'location': 'Beijing', 'temperature': '85', 'unit': 'fahrenheit', "num_days": num_days})
    ...     else:
    ...         return json.dumps({'location': location, 'temperature': 'unknown'})
    ...
    >>> tools = ["get_current_weather", "get_n_day_weather_forecast"]
    >>> tm = ToolManager(tools)
    >>> print(tm([{'name': 'get_n_day_weather_forecast', 'arguments': {'location': 'Beijing', 'num_days': 3}}])[0])
    '{"location": "Beijing", "temperature": "85", "unit": "fahrenheit", "num_days": 3}'
    """
    def __init__(self, tools: List[Union[str, Callable]], return_trace: bool = False):
        super().__init__(return_trace=return_trace)
        self._tools = self._load_tools(tools)
        self._format_tools()
        self._tools_desc = self._transform_to_openai_function()

    def _load_tools(self, tools: List[Union[str, Callable]]):
        if "tmp_tool" not in LazyLLMRegisterMetaClass.all_clses:
            register.new_group('tmp_tool')

        _tools = []
        for element in tools:
            if isinstance(element, str):
                _tools.append(getattr(lazyllm.tool, element)())
            elif isinstance(element, Callable):
                # just to convert `element` to the internal type in `Register`
                register('tmp_tool')(element)
                _tools.append(getattr(lazyllm.tmp_tool, element.__name__)())
                lazyllm.tmp_tool.remove(element.__name__)

        return _tools

    @property
    def all_tools(self):
        return self._tools

    @property
    def tools_description(self):
        return self._tools_desc

    @property
    def tools_info(self):
        return self._tool_call

    def _validate_tool(self, tool_name: str, tool_arguments: Dict[str, Any]):
        tool = self._tool_call.get(tool_name)
        if not tool:
            LOG.error(f'cannot find tool named [{tool_name}]')
            return False

        return tool.validate_parameters(tool_arguments)

    def _format_tools(self):
        if isinstance(self._tools, List):
            self._tool_call = {tool.name: tool for tool in self._tools}

    @staticmethod
    def _gen_args_info_from_moduletool_and_docstring(tool, parsed_docstring):
        """
        returns a dict of param names containing at least
          1. `type`
          2. `description` of params

        for example:
            args = {
                "foo": {
                    "enum": ["baz", "bar"],
                    "type": "string",
                    "description": "a string",
                },
                "bar": {
                    "type": "integer",
                    "description": "an integer",
                }
            }
        """
        tool_args = tool.args
        assert len(tool_args) == len(parsed_docstring.params), ("The parameter description and the actual "
                                                                "number of input parameters are inconsistent.")

        args_description = {}
        for param in parsed_docstring.params:
            args_description[param.arg_name] = param.description

        args = {}
        for k, v in tool_args.items():
            val = copy.deepcopy(v)
            val.pop("title", None)
            val.pop("default", None)
            args[k] = val if val else {"type": "string"}
            desc = args_description.get(k, None)
            if desc:
                args[k].update({"description": desc})
            else:
                raise ValueError(f"The actual input parameter '{k}' is not found "
                                 f"in the parameter description of tool '{tool.name}'.")
        return args

    def _transform_to_openai_function(self):
        if not isinstance(self._tools, List):
            raise TypeError(f"The tools type should be List instead of {type(self._tools)}")

        format_tools = []
        for tool in self._tools:
            try:
                parsed_docstring = docstring_parser.parse(tool.description)
                args = self._gen_args_info_from_moduletool_and_docstring(tool, parsed_docstring)
                required_arg_list = tool.params_schema.model_json_schema().get("required", [])
                func = {
                    "type": "function",
                    "function": {
                        "name": tool.name,
                        "description": parsed_docstring.short_description,
                        "parameters": {
                            "type": "object",
                            "properties": args,
                            "required": required_arg_list,
                        }
                    }
                }
                format_tools.append(func)
            except Exception:
                typehints_template = """
                def myfunc(arg1: str, arg2: Dict[str, Any], arg3: Literal["aaa", "bbb", "ccc"]="aaa"):
                    '''
                    Function description ...

                    Args:
                        arg1 (str): arg1 description.
                        arg2 (Dict[str, Any]): arg2 description
                        arg3 (Literal["aaa", "bbb", "ccc"]): arg3 description
                    '''
                """
                raise TypeError("Function description must include function description and "
                                f"parameter description, the format is as follows: {typehints_template}")
        return format_tools

    def forward(self, tools: Union[Dict[str, Any], List[Dict[str, Any]]], verbose: bool = False):
        tool_calls = [tools,] if isinstance(tools, dict) else tools
        tool_calls = [{"name": tool['name'], "arguments": json.loads(tool['arguments'])
                      if isinstance(tool['arguments'], str) else tool['arguments']} for tool in tool_calls]
        output = []
        flag_val = [True if self._validate_tool(tool['name'], tool['arguments']) else False for tool in tool_calls]
        tool_inputs = [tool_calls[idx]['arguments'] for idx, val in enumerate(flag_val) if val]
        tools = [self._tool_call[tool_calls[idx]['name']] for idx, val in enumerate(flag_val) if val]
        tool_diverter = lazyllm.diverter(tuple(tools))
        rets = tool_diverter(tuple(tool_inputs))
        res = iter(rets)
        rets = [next(res) if ele else None for ele in flag_val]
        for idx, tool in enumerate(tool_calls):
            if flag_val[idx]:
                ret = rets[idx]
                output.append(json.dumps(ret, ensure_ascii=False) if not isinstance(ret, str) else ret)
            else:
                output.append(f"{tool} parameters error.")

        return output

_gen_args_info_from_moduletool_and_docstring(tool, parsed_docstring) staticmethod

Returns a dict keyed by parameter name, where each entry contains at least:

  1. type
  2. description

For example:

args = {
    "foo": {
        "enum": ["baz", "bar"],
        "type": "string",
        "description": "a string",
    },
    "bar": {
        "type": "integer",
        "description": "an integer",
    }
}

Source code in lazyllm/tools/agent/toolsManager.py
@staticmethod
def _gen_args_info_from_moduletool_and_docstring(tool, parsed_docstring):
    """
    returns a dict of param names containing at least
      1. `type`
      2. `description` of params

    for example:
        args = {
            "foo": {
                "enum": ["baz", "bar"],
                "type": "string",
                "description": "a string",
            },
            "bar": {
                "type": "integer",
                "description": "an integer",
            }
        }
    """
    tool_args = tool.args
    assert len(tool_args) == len(parsed_docstring.params), ("The parameter description and the actual "
                                                            "number of input parameters are inconsistent.")

    args_description = {}
    for param in parsed_docstring.params:
        args_description[param.arg_name] = param.description

    args = {}
    for k, v in tool_args.items():
        val = copy.deepcopy(v)
        val.pop("title", None)
        val.pop("default", None)
        args[k] = val if val else {"type": "string"}
        desc = args_description.get(k, None)
        if desc:
            args[k].update({"description": desc})
        else:
            raise ValueError(f"The actual input parameter '{k}' is not found "
                             f"in the parameter description of tool '{tool.name}'.")
    return args

lazyllm.tools.FunctionCall

Bases: ModuleBase

FunctionCall is a single-round tool-call class. When the knowledge inside the LLM is not enough to answer the user's question, external knowledge must be combined to answer it. If the LLM output requires a tool call, the tool call is performed and its result is returned; the output is of List type and contains the current round's input, model output, and tool output. If no tool call is required, the LLM result is returned directly as a string.

Note: The tools passed in tools must have a __doc__ field that clearly describes the purpose and parameters of the tool according to the Google Python Style requirements.

Parameters:

  • llm (ModuleBase) –

    The LLM to be used can be either TrainableModule or OnlineChatModule.

  • tools (List[Union[str, Callable]]) –

    A list of tool names for LLM to use.

Examples:

>>> import lazyllm
>>> from lazyllm.tools import fc_register, FunctionCall
>>> import json
>>> from typing import Literal
>>> @fc_register("tool")
>>> def get_current_weather(location: str, unit: Literal["fahrenheit", "celsius"] = 'fahrenheit'):
...     '''
...     Get the current weather in a given location
...
...     Args:
...         location (str): The city and state, e.g. San Francisco, CA.
...         unit (str): The temperature unit to use. Infer this from the users location.
...     '''
...     if 'tokyo' in location.lower():
...         return json.dumps({'location': 'Tokyo', 'temperature': '10', 'unit': 'celsius'})
...     elif 'san francisco' in location.lower():
...         return json.dumps({'location': 'San Francisco', 'temperature': '72', 'unit': 'fahrenheit'})
...     elif 'paris' in location.lower():
...         return json.dumps({'location': 'Paris', 'temperature': '22', 'unit': 'celsius'})
...     else:
...         return json.dumps({'location': location, 'temperature': 'unknown'})
...
>>> @fc_register("tool")
>>> def get_n_day_weather_forecast(location: str, num_days: int, unit: Literal["celsius", "fahrenheit"] = 'fahrenheit'):
...     '''
...     Get an N-day weather forecast
...
...     Args:
...         location (str): The city and state, e.g. San Francisco, CA.
...         num_days (int): The number of days to forecast.
...         unit (Literal['celsius', 'fahrenheit']): The temperature unit to use. Infer this from the users location.
...     '''
...     if 'tokyo' in location.lower():
...         return json.dumps({'location': 'Tokyo', 'temperature': '10', 'unit': 'celsius', "num_days": num_days})
...     elif 'san francisco' in location.lower():
...         return json.dumps({'location': 'San Francisco', 'temperature': '72', 'unit': 'fahrenheit', "num_days": num_days})
...     elif 'paris' in location.lower():
...         return json.dumps({'location': 'Paris', 'temperature': '22', 'unit': 'celsius', "num_days": num_days})
...     else:
...         return json.dumps({'location': location, 'temperature': 'unknown'})
...
>>> tools=["get_current_weather", "get_n_day_weather_forecast"]
>>> llm = lazyllm.TrainableModule("internlm2-chat-20b").start()  # or llm = lazyllm.OnlineChatModule("openai", stream=False)
>>> query = "What's the weather like today in celsius in Tokyo."
>>> fc = FunctionCall(llm, tools)
>>> ret = fc(query)
>>> print(ret)
["What's the weather like today in celsius in Tokyo.", {'role': 'assistant', 'content': '
', 'tool_calls': [{'id': 'da19cddac0584869879deb1315356d2a', 'type': 'function', 'function': {'name': 'get_current_weather', 'arguments': {'location': 'Tokyo', 'unit': 'celsius'}}}]}, [{'role': 'tool', 'content': '{"location": "Tokyo", "temperature": "10", "unit": "celsius"}', 'tool_call_id': 'da19cddac0584869879deb1315356d2a', 'name': 'get_current_weather'}]]
>>> query = "Hello"
>>> ret = fc(query)
>>> print(ret)
'Hello! How can I assist you today?'
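
Because a finished answer comes back as a string while an unfinished round comes back as a list, FunctionCall can be driven in a manual loop until a string is produced; this is essentially what FunctionCallAgent wraps with loop(..., stop_condition=lambda x: isinstance(x, str)). A hedged sketch reusing fc from the example above:

>>> query = "What's the weather like today in celsius in Paris."
>>> ret = fc(query)
>>> for _ in range(5):  # a bounded retry budget, in the spirit of max_retries
...     if isinstance(ret, str):
...         break  # final answer reached
...     ret = fc(ret)  # feed the tool-call round back into the next call
...
>>> print(ret)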
Source code in lazyllm/tools/agent/functionCall.py
class FunctionCall(ModuleBase):
    """FunctionCall is a single-round tool call class. If the information in LLM is not enough to answer the uesr's question, it is necessary to combine external knowledge to answer the user's question. If the LLM output required a tool call, the tool call is performed and the tool call result is output. The output result is of List type, including the input, model output, and tool output of the current round. If a tool call is not required, the LLM result is directly output, and the output result is of string type.

Note: The tools used in `tools` must have a `__doc__` field, clearly describing the purpose and parameters of the tool according to the [Google Python Style](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) requirements.

Args:
    llm (ModuleBase): The LLM to be used can be either TrainableModule or OnlineChatModule.
    tools (List[Union[str, Callable]]): A list of tool names for LLM to use.


Examples:
    >>> import lazyllm
    >>> from lazyllm.tools import fc_register, FunctionCall
    >>> import json
    >>> from typing import Literal
    >>> @fc_register("tool")
    >>> def get_current_weather(location: str, unit: Literal["fahrenheit", "celsius"] = 'fahrenheit'):
    ...     '''
    ...     Get the current weather in a given location
    ...
    ...     Args:
    ...         location (str): The city and state, e.g. San Francisco, CA.
    ...         unit (str): The temperature unit to use. Infer this from the users location.
    ...     '''
    ...     if 'tokyo' in location.lower():
    ...         return json.dumps({'location': 'Tokyo', 'temperature': '10', 'unit': 'celsius'})
    ...     elif 'san francisco' in location.lower():
    ...         return json.dumps({'location': 'San Francisco', 'temperature': '72', 'unit': 'fahrenheit'})
    ...     elif 'paris' in location.lower():
    ...         return json.dumps({'location': 'Paris', 'temperature': '22', 'unit': 'celsius'})
    ...     else:
    ...         return json.dumps({'location': location, 'temperature': 'unknown'})
    ...
    >>> @fc_register("tool")
    >>> def get_n_day_weather_forecast(location: str, num_days: int, unit: Literal["celsius", "fahrenheit"] = 'fahrenheit'):
    ...     '''
    ...     Get an N-day weather forecast
    ...
    ...     Args:
    ...         location (str): The city and state, e.g. San Francisco, CA.
    ...         num_days (int): The number of days to forecast.
    ...         unit (Literal['celsius', 'fahrenheit']): The temperature unit to use. Infer this from the users location.
    ...     '''
    ...     if 'tokyo' in location.lower():
    ...         return json.dumps({'location': 'Tokyo', 'temperature': '10', 'unit': 'celsius', "num_days": num_days})
    ...     elif 'san francisco' in location.lower():
    ...         return json.dumps({'location': 'San Francisco', 'temperature': '72', 'unit': 'fahrenheit', "num_days": num_days})
    ...     elif 'paris' in location.lower():
    ...         return json.dumps({'location': 'Paris', 'temperature': '22', 'unit': 'celsius', "num_days": num_days})
    ...     else:
    ...         return json.dumps({'location': location, 'temperature': 'unknown'})
    ...
    >>> tools=["get_current_weather", "get_n_day_weather_forecast"]
    >>> llm = lazyllm.TrainableModule("internlm2-chat-20b").start()  # or llm = lazyllm.OnlineChatModule("openai", stream=False)
    >>> query = "What's the weather like today in celsius in Tokyo."
    >>> fc = FunctionCall(llm, tools)
    >>> ret = fc(query)
    >>> print(ret)
    ["What's the weather like today in celsius in Tokyo.", {'role': 'assistant', 'content': '
    ', 'tool_calls': [{'id': 'da19cddac0584869879deb1315356d2a', 'type': 'function', 'function': {'name': 'get_current_weather', 'arguments': {'location': 'Tokyo', 'unit': 'celsius'}}}]}, [{'role': 'tool', 'content': '{"location": "Tokyo", "temperature": "10", "unit": "celsius"}', 'tool_call_id': 'da19cddac0584869879deb1315356d2a', 'name': 'get_current_weather'}]]
    >>> query = "Hello"
    >>> ret = fc(query)
    >>> print(ret)
    'Hello! How can I assist you today?'
    """

    def __init__(self, llm, tools: List[Union[str, Callable]], *, return_trace: bool = False,
                 stream: bool = False, _prompt: str = None):
        super().__init__(return_trace=return_trace)
        if isinstance(llm, OnlineChatModule) and llm.series == "QWEN" and llm._stream is True:
            raise ValueError("The qwen platform does not currently support stream function calls.")
        if _prompt is None:
            _prompt = FC_PROMPT_ONLINE if isinstance(llm, OnlineChatModule) else FC_PROMPT_LOCAL

        self._tools_manager = ToolManager(tools, return_trace=return_trace)
        self._prompter = ChatPrompter(instruction=_prompt, tools=self._tools_manager.tools_description)\
            .pre_hook(function_call_hook)
        self._llm = llm.share(prompt=self._prompter, format=FunctionCallFormatter()).used_by(self._module_id)
        with pipeline() as self._impl:
            self._impl.ins = StreamResponse('Received instruction:', prefix_color=Color.yellow,
                                            color=Color.green, stream=stream)
            self._impl.m1 = self._llm
            self._impl.m2 = self._parser
            self._impl.dis = StreamResponse('Decision-making or result in this round:',
                                            prefix_color=Color.yellow, color=Color.green, stream=stream)
            self._impl.m3 = ifs(lambda x: isinstance(x, list),
                                pipeline(self._tools_manager, StreamResponse('Tool-Call result:',
                                         prefix_color=Color.yellow, color=Color.green, stream=stream)),
                                lambda out: out)
            self._impl.m4 = self._tool_post_action | bind(input=self._impl.input, llm_output=self._impl.m1)

    def _parser(self, llm_output: Union[str, List[Dict[str, Any]]]):
        LOG.debug(f"llm_output: {llm_output}")
        if isinstance(llm_output, list):
            res = []
            for item in llm_output:
                if isinstance(item, str):
                    continue
                arguments = item.get('function', {}).get('arguments', '')
                arguments = json.loads(arguments) if isinstance(arguments, str) else arguments
                res.append({"name": item.get('function', {}).get('name', ''), 'arguments': arguments})
            return res
        elif isinstance(llm_output, str):
            return llm_output
        else:
            raise TypeError(f"The {llm_output} type currently is only supports `list` and `str`,"
                            f" and does not support {type(llm_output)}.")

    def _tool_post_action(self, output: Union[str, List[str]], input: Union[str, List],
                          llm_output: List[Dict[str, Any]]):
        if isinstance(output, list):
            ret = []
            if isinstance(input, str):
                ret.append(input)
            elif isinstance(input, list):
                ret.append(input[-1])
            else:
                raise TypeError(f"The input type currently only supports `str` and `list`, "
                                f"and does not support {type(input)}.")

            content = "".join([item for item in llm_output if isinstance(item, str)])
            llm_output = [item for item in llm_output if not isinstance(item, str)]
            ret.append({"role": "assistant", "content": content, "tool_calls": llm_output})
            ret.append([{"role": "tool", "content": out, "tool_call_id": llm_output[idx]["id"],
                         "name": llm_output[idx]["function"]["name"]}
                        for idx, out in enumerate(output)])
            LOG.debug(f"functionCall result: {ret}")
            return ret
        elif isinstance(output, str):
            return output
        else:
            raise TypeError(f"The {output} type currently is only supports `list` and `str`,"
                            f" and does not support {type(output)}.")

    def forward(self, input: str, llm_chat_history: List[Dict[str, Any]] = None):
        globals['chat_history'].setdefault(self._llm._module_id, [])
        if llm_chat_history is not None:
            globals['chat_history'][self._llm._module_id] = llm_chat_history
        return self._impl(input)

lazyllm.tools.FunctionCallAgent

Bases: ModuleBase

FunctionCallAgent is an agent that performs complete tool calls using the tool-calling mechanism. That is, when answering user questions, if the LLM needs external knowledge obtained through a tool, it calls the tool, feeds the tool's return value back to the LLM, and the LLM finally summarizes the results and outputs them.

Parameters:

  • llm (ModuleBase) –

    The LLM to be used can be either TrainableModule or OnlineChatModule.

  • tools (List[str]) –

    A list of tool names for LLM to use.

  • max_retries (int, default: 5 ) –

    The maximum number of tool call iterations. The default value is 5.

Examples:

>>> import lazyllm
>>> from lazyllm.tools import fc_register, FunctionCallAgent
>>> import json
>>> from typing import Literal
>>> @fc_register("tool")
>>> def get_current_weather(location: str, unit: Literal["fahrenheit", "celsius"]='fahrenheit'):
...     '''
...     Get the current weather in a given location
...
...     Args:
...         location (str): The city and state, e.g. San Francisco, CA.
...         unit (str): The temperature unit to use. Infer this from the users location.
...     '''
...     if 'tokyo' in location.lower():
...         return json.dumps({'location': 'Tokyo', 'temperature': '10', 'unit': 'celsius'})
...     elif 'san francisco' in location.lower():
...         return json.dumps({'location': 'San Francisco', 'temperature': '72', 'unit': 'fahrenheit'})
...     elif 'paris' in location.lower():
...         return json.dumps({'location': 'Paris', 'temperature': '22', 'unit': 'celsius'})
...     elif 'beijing' in location.lower():
...         return json.dumps({'location': 'Beijing', 'temperature': '90', 'unit': 'Fahrenheit'})
...     else:
...         return json.dumps({'location': location, 'temperature': 'unknown'})
...
>>> @fc_register("tool")
>>> def get_n_day_weather_forecast(location: str, num_days: int, unit: Literal["celsius", "fahrenheit"]='fahrenheit'):
...     '''
...     Get an N-day weather forecast
...
...     Args:
...         location (str): The city and state, e.g. San Francisco, CA.
...         num_days (int): The number of days to forecast.
...         unit (Literal['celsius', 'fahrenheit']): The temperature unit to use. Infer this from the users location.
...     '''
...     if 'tokyo' in location.lower():
...         return json.dumps({'location': 'Tokyo', 'temperature': '10', 'unit': 'celsius', "num_days": num_days})
...     elif 'san francisco' in location.lower():
...         return json.dumps({'location': 'San Francisco', 'temperature': '75', 'unit': 'fahrenheit', "num_days": num_days})
...     elif 'paris' in location.lower():
...         return json.dumps({'location': 'Paris', 'temperature': '25', 'unit': 'celsius', "num_days": num_days})
...     elif 'beijing' in location.lower():
...         return json.dumps({'location': 'Beijing', 'temperature': '85', 'unit': 'fahrenheit', "num_days": num_days})
...     else:
...         return json.dumps({'location': location, 'temperature': 'unknown'})
...
>>> tools = ['get_current_weather', 'get_n_day_weather_forecast']
>>> llm = lazyllm.TrainableModule("internlm2-chat-20b").start()  # or llm = lazyllm.OnlineChatModule(source="sensenova")
>>> agent = FunctionCallAgent(llm, tools)
>>> query = "What's the weather like today in celsius in Tokyo and Paris."
>>> res = agent(query)
>>> print(res)
'The current weather in Tokyo is 10 degrees Celsius, and in Paris, it is 22 degrees Celsius.'
>>> query = "Hello"
>>> res = agent(query)
>>> print(res)
'Hello! How can I assist you today?'
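
The forward method shown below raises a ValueError if the loop still has not produced a plain-string answer after max_retries rounds, so a tighter budget can be set and the failure handled explicitly. A hedged sketch reusing llm and tools from the example above:

>>> agent = FunctionCallAgent(llm, tools, max_retries=2)
>>> try:
...     print(agent("What's the weather like today in celsius in Tokyo and Paris."))
... except ValueError as e:
...     print(f"Agent exceeded its retry budget: {e}")
...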
Source code in lazyllm/tools/agent/functionCall.py
class FunctionCallAgent(ModuleBase):
    """FunctionCallAgent is an agent that uses the tool calling method to perform complete tool calls. That is, when answering uesr questions, if LLM needs to obtain external knowledge through the tool, it will call the tool and feed back the return results of the tool to LLM, which will finally summarize and output them.

Args:
    llm (ModuleBase): The LLM to be used can be either TrainableModule or OnlineChatModule.
    tools (List[str]): A list of tool names for LLM to use.
    max_retries (int): The maximum number of tool call iterations. The default value is 5.


Examples:
    >>> import lazyllm
    >>> from lazyllm.tools import fc_register, FunctionCallAgent
    >>> import json
    >>> from typing import Literal
    >>> @fc_register("tool")
    >>> def get_current_weather(location: str, unit: Literal["fahrenheit", "celsius"]='fahrenheit'):
    ...     '''
    ...     Get the current weather in a given location
    ...
    ...     Args:
    ...         location (str): The city and state, e.g. San Francisco, CA.
    ...         unit (str): The temperature unit to use. Infer this from the users location.
    ...     '''
    ...     if 'tokyo' in location.lower():
    ...         return json.dumps({'location': 'Tokyo', 'temperature': '10', 'unit': 'celsius'})
    ...     elif 'san francisco' in location.lower():
    ...         return json.dumps({'location': 'San Francisco', 'temperature': '72', 'unit': 'fahrenheit'})
    ...     elif 'paris' in location.lower():
    ...         return json.dumps({'location': 'Paris', 'temperature': '22', 'unit': 'celsius'})
    ...     elif 'beijing' in location.lower():
    ...         return json.dumps({'location': 'Beijing', 'temperature': '90', 'unit': 'Fahrenheit'})
    ...     else:
    ...         return json.dumps({'location': location, 'temperature': 'unknown'})
    ...
    >>> @fc_register("tool")
    >>> def get_n_day_weather_forecast(location: str, num_days: int, unit: Literal["celsius", "fahrenheit"]='fahrenheit'):
    ...     '''
    ...     Get an N-day weather forecast
    ...
    ...     Args:
    ...         location (str): The city and state, e.g. San Francisco, CA.
    ...         num_days (int): The number of days to forecast.
    ...         unit (Literal['celsius', 'fahrenheit']): The temperature unit to use. Infer this from the users location.
    ...     '''
    ...     if 'tokyo' in location.lower():
    ...         return json.dumps({'location': 'Tokyo', 'temperature': '10', 'unit': 'celsius', "num_days": num_days})
    ...     elif 'san francisco' in location.lower():
    ...         return json.dumps({'location': 'San Francisco', 'temperature': '75', 'unit': 'fahrenheit', "num_days": num_days})
    ...     elif 'paris' in location.lower():
    ...         return json.dumps({'location': 'Paris', 'temperature': '25', 'unit': 'celsius', "num_days": num_days})
    ...     elif 'beijing' in location.lower():
    ...         return json.dumps({'location': 'Beijing', 'temperature': '85', 'unit': 'fahrenheit', "num_days": num_days})
    ...     else:
    ...         return json.dumps({'location': location, 'temperature': 'unknown'})
    ...
    >>> tools = ['get_current_weather', 'get_n_day_weather_forecast']
    >>> llm = lazyllm.TrainableModule("internlm2-chat-20b").start()  # or llm = lazyllm.OnlineChatModule(source="sensenova")
    >>> agent = FunctionCallAgent(llm, tools)
    >>> query = "What's the weather like today in celsius in Tokyo and Paris."
    >>> res = agent(query)
    >>> print(res)
    'The current weather in Tokyo is 10 degrees Celsius, and in Paris, it is 22 degrees Celsius.'
    >>> query = "Hello"
    >>> res = agent(query)
    >>> print(res)
    'Hello! How can I assist you today?'
    """
    def __init__(self, llm, tools: List[str], max_retries: int = 5, return_trace: bool = False, stream: bool = False):
        super().__init__(return_trace=return_trace)
        self._max_retries = max_retries
        self._fc = FunctionCall(llm, tools, return_trace=return_trace, stream=stream)
        self._agent = loop(self._fc, stop_condition=lambda x: isinstance(x, str), count=self._max_retries)
        self._fc._llm.used_by(self._module_id)

    def forward(self, query: str, llm_chat_history: List[Dict[str, Any]] = None):
        ret = self._agent(query, llm_chat_history) if llm_chat_history is not None else self._agent(query)
        return ret if isinstance(ret, str) else (_ for _ in ()).throw(
            ValueError(f"After retrying {self._max_retries} times, the function call agent still "
                       "failed to call successfully."))

lazyllm.tools.ReactAgent

Bases: ModuleBase

ReactAgent follows the Thought->Action->Observation->Thought...->Finish process step by step, using the LLM and tool calls, to show the user both the steps taken to solve the question and the final answer.

Parameters:

  • llm (ModuleBase) –

    The LLM to be used can be either TrainableModule or OnlineChatModule.

  • tools (List[str]) –

    A list of tool names for LLM to use.

  • max_retries (int, default: 5 ) –

    The maximum number of tool call iterations. The default value is 5.

Examples:

>>> import lazyllm
>>> from lazyllm.tools import fc_register, ReactAgent
>>> @fc_register("tool")
>>> def multiply_tool(a: int, b: int) -> int:
...     '''
...     Multiply two integers and return the result integer
...
...     Args:
...         a (int): multiplier
...         b (int): multiplier
...     '''
...     return a * b
...
>>> @fc_register("tool")
>>> def add_tool(a: int, b: int):
...     '''
...     Add two integers and returns the result integer
...
...     Args:
...         a (int): addend
...         b (int): addend
...     '''
...     return a + b
...
>>> tools = ["multiply_tool", "add_tool"]
>>> llm = lazyllm.TrainableModule("internlm2-chat-20b").start()   # or llm = lazyllm.OnlineChatModule(source="sensenova")
>>> agent = ReactAgent(llm, tools)
>>> query = "What is 20+(2*4)? Calculate step by step."
>>> res = agent(query)
>>> print(res)
'Answer: The result of 20+(2*4) is 28.'
Source code in lazyllm/tools/agent/reactAgent.py
class ReactAgent(ModuleBase):
    """ReactAgent follows the process of `Thought->Action->Observation->Thought...->Finish` step by step through LLM and tool calls to display the steps to solve user questions and the final answer to the user.

Args:
    llm (ModuleBase): The LLM to be used can be either TrainableModule or OnlineChatModule.
    tools (List[str]): A list of tool names for LLM to use.
    max_retries (int): The maximum number of tool call iterations. The default value is 5.


Examples:
    >>> import lazyllm
    >>> from lazyllm.tools import fc_register, ReactAgent
    >>> @fc_register("tool")
    >>> def multiply_tool(a: int, b: int) -> int:
    ...     '''
    ...     Multiply two integers and return the result integer
    ...
    ...     Args:
    ...         a (int): multiplier
    ...         b (int): multiplier
    ...     '''
    ...     return a * b
    ...
    >>> @fc_register("tool")
    >>> def add_tool(a: int, b: int):
    ...     '''
    ...     Add two integers and returns the result integer
    ...
    ...     Args:
    ...         a (int): addend
    ...         b (int): addend
    ...     '''
    ...     return a + b
    ...
    >>> tools = ["multiply_tool", "add_tool"]
    >>> llm = lazyllm.TrainableModule("internlm2-chat-20b").start()   # or llm = lazyllm.OnlineChatModule(source="sensenova")
    >>> agent = ReactAgent(llm, tools)
    >>> query = "What is 20+(2*4)? Calculate step by step."
    >>> res = agent(query)
    >>> print(res)
    'Answer: The result of 20+(2*4) is 28.'
    """
    def __init__(self, llm, tools: List[str], max_retries: int = 5, return_trace: bool = False):
        super().__init__(return_trace=return_trace)
        self._max_retries = max_retries
        assert llm and tools, "llm and tools cannot be empty."

        prompt = INSTRUCTION.replace("{TOKENIZED_PROMPT}", WITHOUT_TOKEN_PROMPT if isinstance(llm, OnlineChatModule)
                                     else WITH_TOKEN_PROMPT)
        prompt = prompt.replace("{tool_names}", json.dumps([t.__name__ if callable(t) else t for t in tools],
                                                           ensure_ascii=False))
        self._agent = loop(FunctionCall(llm, tools, _prompt=prompt, return_trace=return_trace),
                           stop_condition=lambda x: isinstance(x, str), count=self._max_retries)

    def forward(self, query: str, llm_chat_history: List[Dict[str, Any]] = None):
        ret = self._agent(query, llm_chat_history) if llm_chat_history is not None else self._agent(query)
        return ret if isinstance(ret, str) else (_ for _ in ()).throw(ValueError(f"After retrying \
            {self._max_retries} times, the function call agent still fails to call successfully."))

lazyllm.tools.PlanAndSolveAgent

Bases: ModuleBase

PlanAndSolveAgent consists of two components. First, the planner breaks down the entire task into smaller subtasks, then the solver executes these subtasks according to the plan, which may involve tool calls, and finally returns the answer to the user.

Parameters:

  • llm (ModuleBase, default: None ) –

    The LLM to be used can be TrainableModule or OnlineChatModule. It is mutually exclusive with plan_llm and solve_llm. Either set llm alone (the planner and solver share the same LLM), or set plan_llm and solve_llm, or specify llm (for the planner) together with solve_llm. Other combinations are considered invalid.

  • tools (List[str], default: [] ) –

    A list of tool names for LLM to use.

  • plan_llm (ModuleBase, default: None ) –

    The LLM to be used by the planner, which can be either TrainableModule or OnlineChatModule.

  • solve_llm (ModuleBase, default: None ) –

    The LLM to be used by the solver, which can be either TrainableModule or OnlineChatModule.

  • max_retries (int, default: 5 ) –

    The maximum number of tool call iterations. The default value is 5.

Examples:

>>> import lazyllm
>>> from lazyllm.tools import fc_register, PlanAndSolveAgent
>>> @fc_register("tool")
>>> def multiply(a: int, b: int) -> int:
...     '''
...     Multiply two integers and return the result integer
...
...     Args:
...         a (int): multiplier
...         b (int): multiplier
...     '''
...     return a * b
...
>>> @fc_register("tool")
>>> def add(a: int, b: int):
...     '''
...     Add two integers and returns the result integer
...
...     Args:
...         a (int): addend
...         b (int): addend
...     '''
...     return a + b
...
>>> tools = ["multiply", "add"]
>>> llm = lazyllm.TrainableModule("internlm2-chat-20b").start()  # or llm = lazyllm.OnlineChatModule(source="sensenova")
>>> agent = PlanAndSolveAgent(llm, tools)
>>> query = "What is 20+(2*4)? Calculate step by step."
>>> res = agent(query)
>>> print(res)
'The final answer is 28.'
Source code in lazyllm/tools/agent/planAndSolveAgent.py
class PlanAndSolveAgent(ModuleBase):
    """PlanAndSolveAgent consists of two components. First, the planner breaks down the entire task into smaller subtasks, then the solver executes these subtasks according to the plan, which may involve tool calls, and finally returns the answer to the user.

Args:
    llm (ModuleBase): The LLM to be used, which can be TrainableModule or OnlineChatModule. It is mutually exclusive with plan_llm and solve_llm. Either set llm (the planner and solver share the same LLM), or set plan_llm and solve_llm, or specify only llm (for the planner) together with solve_llm. Other cases are considered invalid.
    tools (List[str]): A list of tool names for LLM to use.
    plan_llm (ModuleBase): The LLM to be used by the planner, which can be either TrainableModule or OnlineChatModule.
    solve_llm (ModuleBase): The LLM to be used by the solver, which can be either TrainableModule or OnlineChatModule.
    max_retries (int): The maximum number of tool call iterations. The default value is 5.


Examples:
    >>> import lazyllm
    >>> from lazyllm.tools import fc_register, PlanAndSolveAgent
    >>> @fc_register("tool")
    >>> def multiply(a: int, b: int) -> int:
    ...     '''
    ...     Multiply two integers and return the result integer
    ...
    ...     Args:
    ...         a (int): multiplier
    ...         b (int): multiplier
    ...     '''
    ...     return a * b
    ...
    >>> @fc_register("tool")
    >>> def add(a: int, b: int):
    ...     '''
    ...     Add two integers and return the result integer
    ...
    ...     Args:
    ...         a (int): addend
    ...         b (int): addend
    ...     '''
    ...     return a + b
    ...
    >>> tools = ["multiply", "add"]
    >>> llm = lazyllm.TrainableModule("internlm2-chat-20b").start()  # or llm = lazyllm.OnlineChatModule(source="sensenova")
    >>> agent = PlanAndSolveAgent(llm, tools)
    >>> query = "What is 20+(2*4)? Calculate step by step."
    >>> res = agent(query)
    >>> print(res)
    'The final answer is 28.'
    """
    def __init__(self, llm: Union[ModuleBase, None] = None, tools: List[str] = [], *,
                 plan_llm: Union[ModuleBase, None] = None, solve_llm: Union[ModuleBase, None] = None,
                 max_retries: int = 5, return_trace: bool = False, stream: bool = False):
        super().__init__(return_trace=return_trace)
        self._max_retries = max_retries
        assert (llm is None and plan_llm and solve_llm) or (llm and plan_llm is None), 'Either specify only llm \
               without specifying plan and solve, or specify only plan and solve without specifying llm, or specify \
               both llm and solve. Other situations are not allowed.'
        assert tools, "tools cannot be empty."
        s = dict(prefix='I will give a plan first:\n', prefix_color=Color.blue, color=Color.green) if stream else False
        self._plan_llm = ((plan_llm or llm).share(prompt=ChatPrompter(instruction=PLANNER_PROMPT),
                                                  stream=s).used_by(self._module_id))
        self._solve_llm = (solve_llm or llm).share().used_by(self._module_id)
        self._tools = tools
        with pipeline() as self._agent:
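            # pipeline: plan with the planner LLM, split the reply into numbered steps, then loop a
            # FunctionCallAgent over the steps (carrying each step's result forward) until none remain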
            self._agent.plan = self._plan_llm
            self._agent.parse = (lambda text, query: package([], '', [v for v in re.split("\n\\s*\\d+\\. ", text)[1:]],
                                 query)) | bind(query=self._agent.input)
            with loop(stop_condition=lambda pre, res, steps, query: len(steps) == 0) as self._agent.lp:
                self._agent.lp.pre_action = self._pre_action
                self._agent.lp.solve = FunctionCallAgent(self._solve_llm, tools=self._tools,
                                                         return_trace=return_trace, stream=stream)
                self._agent.lp.post_action = self._post_action | bind(self._agent.lp.input[0][0], _0,
                                                                      self._agent.lp.input[0][2],
                                                                      self._agent.lp.input[0][3])

            self._agent.post_action = lambda pre, res, steps, query: res

    def _pre_action(self, pre_steps, response, steps, query):
        result = package(SOLVER_PROMPT.format(previous_steps="\n".join(pre_steps), current_step=steps[0],
                                              objective=query) + "input: " + response + "\n" + steps[0], [])
        return result

    def _post_action(self, pre_steps: List[str], response: str, steps: List[str], query: str):
        LOG.debug(f"current step: {steps[0]}, response: {response}")
        pre_steps.append(steps.pop(0))
        return package(pre_steps, response, steps, query)

    def forward(self, query: str):
        return self._agent(query)

lazyllm.tools.ReWOOAgent

Bases: ModuleBase

ReWOOAgent consists of three parts: Planner, Worker and Solver. The Planner uses predictive reasoning capabilities to create a solution blueprint for a complex task; the Worker interacts with the environment through tool calls and fills in actual evidence or observations into instructions; the Solver processes all plans and evidence to develop a solution to the original task or problem.
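
Concretely, the Planner is expected to emit pairs of Plan lines and evidence placeholders of the form #E<n> = Tool[input]; the Worker fills each placeholder by running the named tool, substituting earlier evidence wherever it is referenced, and the Solver then composes the final answer from the plans and the collected evidence. A simplified sketch of that intermediate format (the plan content is illustrative; the actual parsing lives in _parse_plan and _get_worker_evidences in the source below):

planner_reply = (
    "Plan: Search Wikipedia for the main ingredient of The Hennchata.\n"
    "#E1 = WikipediaWorker[The Hennchata]\n"
    "Plan: Ask the LLM which cognac house makes that ingredient.\n"
    "#E2 = LLMWorker[Which cognac house makes the main ingredient mentioned in #E1?]\n"
)
plans = [line for line in planner_reply.splitlines() if line.startswith("Plan")]
evidence = {e.strip(): call.strip() for e, call in
            (line.split("=", 1) for line in planner_reply.splitlines() if line.startswith("#E"))}
# evidence == {'#E1': 'WikipediaWorker[The Hennchata]', '#E2': 'LLMWorker[... #E1 ...]'}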

Parameters:

  • llm (ModuleBase, default: None ) –

    The LLM to be used, which can be a TrainableModule or OnlineChatModule. It is mutually exclusive with plan_llm and solve_llm: either set llm (the planner and solver share the same LLM), or set plan_llm and solve_llm, or specify only llm (for the planner) together with solve_llm. Other combinations are considered invalid.

  • tools (List[str], default: [] ) –

    A list of tool names for LLM to use.

  • plan_llm (ModuleBase, default: None ) –

    The LLM to be used by the planner, which can be either TrainableModule or OnlineChatModule.

  • solve_llm (ModuleBase, default: None ) –

    The LLM to be used by the solver, which can be either TrainableModule or OnlineChatModule.

Examples:

>>> import lazyllm
>>> import wikipedia
>>> from lazyllm.tools import fc_register, ReWOOAgent
>>> @fc_register("tool")
>>> def WikipediaWorker(input: str):
...     '''
...     Worker that searches for similar page contents from Wikipedia. Useful when you need to get holistic knowledge about people, places, companies, historical events, or other subjects. The responses are long and might contain some irrelevant information. Input should be a search query.
...
...     Args:
...         input (str): search query.
...     '''
...     try:
...         evidence = wikipedia.page(input).content
...         evidence = evidence.split("\n\n")[0]
...     except wikipedia.PageError:
...         evidence = f"Could not find [{input}]. Similar: {wikipedia.search(input)}"
...     except wikipedia.DisambiguationError:
...         evidence = f"Could not find [{input}]. Similar: {wikipedia.search(input)}"
...     return evidence
...
>>> @fc_register("tool")
>>> def LLMWorker(input: str):
...     '''
...     A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction.
...
...     Args:
...         input (str): instruction
...     '''
...     llm = lazyllm.OnlineChatModule(source="glm")
...     query = f"Respond in short directly with no extra words.\n\n{input}"
...     response = llm(query, llm_chat_history=[])
...     return response
...
>>> tools = ["WikipediaWorker", "LLMWorker"]
>>> llm = lazyllm.TrainableModule("GLM-4-9B-Chat").deploy_method(lazyllm.deploy.vllm).start()  # or llm = lazyllm.OnlineChatModule(source="sensenova")
>>> agent = ReWOOAgent(llm, tools)
>>> query = "What is the name of the cognac house that makes the main ingredient in The Hennchata?"
>>> res = agent(query)
>>> print(res)
'
Hennessy '
Source code in lazyllm/tools/agent/rewooAgent.py
class ReWOOAgent(ModuleBase):
    """ReWOOAgent consists of three parts: Planer, Worker and Solver. The Planner uses predictive reasoning capabilities to create a solution blueprint for a complex task; the Worker interacts with the environment through tool calls and fills in actual evidence or observations into instructions; the Solver processes all plans and evidence to develop a solution to the original task or problem.

Args:
    llm (ModuleBase): The LLM to be used, which can be TrainableModule or OnlineChatModule. It is mutually exclusive with plan_llm and solve_llm. Either set llm (the planner and solver share the same LLM), or set plan_llm and solve_llm, or specify only llm (for the planner) together with solve_llm. Other cases are considered invalid.
    tools (List[str]): A list of tool names for LLM to use.
    plan_llm (ModuleBase): The LLM to be used by the planner, which can be either TrainableModule or OnlineChatModule.
    solve_llm (ModuleBase): The LLM to be used by the solver, which can be either TrainableModule or OnlineChatModule.


Examples:
    >>> import lazyllm
    >>> import wikipedia
    >>> from lazyllm.tools import fc_register, ReWOOAgent
    >>> @fc_register("tool")
    >>> def WikipediaWorker(input: str):
    ...     '''
    ...     Worker that searches for similar page contents from Wikipedia. Useful when you need to get holistic knowledge about people, places, companies, historical events, or other subjects. The responses are long and might contain some irrelevant information. Input should be a search query.
    ...
    ...     Args:
    ...         input (str): search query.
    ...     '''
    ...     try:
    ...         evidence = wikipedia.page(input).content
    ...         evidence = evidence.split("\\n\\n")[0]
    ...     except wikipedia.PageError:
    ...         evidence = f"Could not find [{input}]. Similar: {wikipedia.search(input)}"
    ...     except wikipedia.DisambiguationError:
    ...         evidence = f"Could not find [{input}]. Similar: {wikipedia.search(input)}"
    ...     return evidence
    ...
    >>> @fc_register("tool")
    >>> def LLMWorker(input: str):
    ...     '''
    ...     A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction.
    ...
    ...     Args:
    ...         input (str): instruction
    ...     '''
    ...     llm = lazyllm.OnlineChatModule(source="glm")
    ...     query = f"Respond in short directly with no extra words.\\n\\n{input}"
    ...     response = llm(query, llm_chat_history=[])
    ...     return response
    ...
    >>> tools = ["WikipediaWorker", "LLMWorker"]
    >>> llm = lazyllm.TrainableModule("GLM-4-9B-Chat").deploy_method(lazyllm.deploy.vllm).start()  # or llm = lazyllm.OnlineChatModule(source="sensenova")
    >>> agent = ReWOOAgent(llm, tools)
    >>> query = "What is the name of the cognac house that makes the main ingredient in The Hennchata?"
    >>> res = agent(query)
    >>> print(res)
    '
    Hennessy '
    """
    def __init__(self, llm: Union[ModuleBase, None] = None, tools: List[Union[str, Callable]] = [], *,
                 plan_llm: Union[ModuleBase, None] = None, solve_llm: Union[ModuleBase, None] = None,
                 return_trace: bool = False, stream: bool = False):
        super().__init__(return_trace=return_trace)
        assert (llm is None and plan_llm and solve_llm) or (llm and plan_llm is None), 'Either specify only llm \
               without specifying plan and solve, or specify only plan and solve without specifying llm, or specify \
               both llm and solve. Other situations are not allowed.'
        assert tools, "tools cannot be empty."
        self._planner = (plan_llm or llm).share(stream=dict(
            prefix='\nI will give a plan first:\n', prefix_color=Color.blue, color=Color.green) if stream else False)
        self._solver = (solve_llm or llm).share(stream=dict(
            prefix='\nI will solve the problem:\n', prefix_color=Color.blue, color=Color.green) if stream else False)
        self._name2tool = ToolManager(tools, return_trace=return_trace).tools_info
        with pipeline() as self._agent:
            self._agent.planner_pre_action = self._build_planner_prompt
            self._agent.planner = self._planner
            self._agent.parse_plan = self._parse_plan
            self._agent.woker = self._get_worker_evidences
            self._agent.solver_pre_action = self._build_solver_prompt | bind(input=self._agent.input)
            self._agent.solver = self._solver

    def _build_planner_prompt(self, input: str):
        prompt = P_PROMPT_PREFIX + "Tools can be one of the following:\n"
        for name, tool in self._name2tool.items():
            prompt += f"{name}[search query]: {tool.description}\n"
        prompt += P_FEWSHOT + "\n" + P_PROMPT_SUFFIX + input + "\n"
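        # start the planner from an empty chat history so each query is planned independently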
        globals['chat_history'][self._planner._module_id] = []
        return prompt

    def _parse_plan(self, response: str):
        LOG.debug(f"planner plans: {response}")
        plans = []
        evidence = {}
        for line in response.splitlines():
            if line.startswith("Plan"):
                plans.append(line)
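            # evidence lines look like "#E1 = Tool[input]"; keep the placeholder-to-tool-call mapping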
            elif line.startswith("#") and line[1] == "E" and line[2].isdigit():
                e, tool_call = line.split("=", 1)
                e, tool_call = e.strip(), tool_call.strip()
                if len(e) == 3:
                    evidence[e] = tool_call
                else:
                    evidence[e] = "No evidence found"
        return package(plans, evidence)

    def _get_worker_evidences(self, plans: List[str], evidence: Dict[str, str]):
        worker_evidences = {}
        for e, tool_call in evidence.items():
            if "[" not in tool_call:
                worker_evidences[e] = tool_call
                continue
            tool, tool_input = tool_call.split("[", 1)
            tool_input = tool_input[:-1].strip("'").strip('"')
            # find variables in input and replace with previous evidences
            for var in re.findall(r"#E\d+", tool_input):
                if var in worker_evidences:
                    tool_input = tool_input.replace(var, "[" + worker_evidences[var] + "]")
            tool_instance = self._name2tool.get(tool)
            if tool_instance:
                worker_evidences[e] = tool_instance(tool_input)
            else:
                worker_evidences[e] = "No evidence found"

        worker_log = ""
        for idx, plan in enumerate(plans):
            e = f"#E{idx+1}"
            worker_log += f"{plan}\nEvidence:\n{worker_evidences[e]}\n"
        LOG.debug(f"worker_log: {worker_log}")
        return worker_log

    def _build_solver_prompt(self, worker_log, input):
        prompt = S_PROMPT_PREFIX + input + "\n" + worker_log + S_PROMPT_SUFFIX + input + "\n"
        globals['chat_history'][self._solver._module_id] = []
        return prompt

    def forward(self, query: str):
        return self._agent(query)

lazyllm.tools.IntentClassifier

Bases: ModuleBase

IntentClassifier is an intent recognizer built on a language model: it identifies a predefined intent from user-provided input text and the conversational context. It handles intent lists and ensures accurate intent recognition through preprocessing and postprocessing steps.

Parameters:

  • llm

    A language model object used for intent recognition, which can be of type OnlineChatModule or TrainableModule.

  • intent_list (list, default: None ) –

    A list of strings containing all possible intents. This list can include intents in either Chinese or English.

  • prompt (str, default: '' ) –

    Additional prompt text attached by the user.

  • constrain (str, default: '' ) –

    Additional constraint text attached by the user.

  • examples (list[list], default: [] ) –

    Extra examples; the format is [[query, intent], [query, intent], ...].

  • return_trace (bool, default: False ) –

    If set to True, the results will be recorded in the trace. Defaults to False.

Examples:

>>> import lazyllm
>>> from lazyllm.tools import IntentClassifier
>>> classifier_llm = lazyllm.OnlineChatModule(source="openai")
>>> chatflow_intent_list = ["Chat", "Financial Knowledge Q&A", "Employee Information Query", "Weather Query"]
>>> classifier = IntentClassifier(classifier_llm, intent_list=chatflow_intent_list)
>>> classifier.start()
>>> print(classifier('What is the weather today'))
Weather Query
>>>
>>> with IntentClassifier(classifier_llm) as ic:
>>>     ic.case['Weather Query', lambda x: '38.5°C']
>>>     ic.case['Chat', lambda x: 'permission denied']
>>>     ic.case['Financial Knowledge Q&A', lambda x: 'Calling Financial RAG']
>>>     ic.case['Employee Information Query', lambda x: 'Beijing']
...
>>> ic.start()
>>> print(ic('What is the weather today'))
38.5°C
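
The prompt, constrain, and examples arguments are not exercised in the examples above. A minimal sketch of how they can be passed, reusing classifier_llm from above (the intent names, prompt text, and sample queries are placeholders); the call returns one intent from intent_list, and forward additionally accepts an optional llm_chat_history list for conversational context:

>>> classifier = IntentClassifier(classifier_llm, intent_list=["Chat", "Weather Query"],
...                               prompt="You are routing queries for an internal assistant.",
...                               constrain="Return exactly one intent from the list.",
...                               examples=[["Will it rain tomorrow?", "Weather Query"],
...                                         ["Tell me a joke", "Chat"]])
>>> classifier.start()
>>> intent = classifier("How hot is it in Beijing?")
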
Source code in lazyllm/tools/classifier/intent_classifier.py
class IntentClassifier(ModuleBase):
    """IntentClassifier is an intent recognizer based on a language model that identifies predefined intents based on user-provided input text and conversational context.
It can handle intent lists and ensures accurate intent recognition through preprocessing and postprocessing steps.

Args:
    llm: A language model object used for intent recognition, which can be of type OnlineChatModule or TrainableModule.
    intent_list (list): A list of strings containing all possible intents. This list can include intents in either Chinese or English.
    prompt (str): Additional prompt text attached by the user.
    constrain (str): Additional constraint text attached by the user.
    examples (list[list]): Extra examples; the format is `[[query, intent], [query, intent], ...]`.
    return_trace (bool, optional): If set to True, the results will be recorded in the trace. Defaults to False.


Examples:
        >>> import lazyllm
        >>> from lazyllm.tools import IntentClassifier
        >>> classifier_llm = lazyllm.OnlineChatModule(source="openai")
        >>> chatflow_intent_list = ["Chat", "Financial Knowledge Q&A", "Employee Information Query", "Weather Query"]
        >>> classifier = IntentClassifier(classifier_llm, intent_list=chatflow_intent_list)
        >>> classifier.start()
        >>> print(classifier('What is the weather today'))
        Weather Query
        >>>
        >>> with IntentClassifier(classifier_llm) as ic:
        >>>     ic.case['Weather Query', lambda x: '38.5°C']
        >>>     ic.case['Chat', lambda x: 'permission denied']
        >>>     ic.case['Financial Knowledge Q&A', lambda x: 'Calling Financial RAG']
        >>>     ic.case['Employee Information Query', lambda x: 'Beijing']
        ...
        >>> ic.start()
        >>> print(ic('What is the weather today'))
        38.5°C
    """
    def __init__(self, llm, intent_list: list = None,
                 *, prompt: str = '', constrain: str = '', attention: str = '',
                 examples: list[list[str, str]] = [], return_trace: bool = False) -> None:
        super().__init__(return_trace=return_trace)
        self._intent_list = intent_list or []
        self._llm = llm
        self._prompt, self._constrain, self._attention, self._examples = prompt, constrain, attention, examples
        if self._intent_list:
            self._init()

    def _init(self):
        def choose_prompt():
            # Use chinese prompt if intent elements have chinese character, otherwise use english version
            for ele in self._intent_list:
                for ch in ele:
                    # chinese unicode range
                    if "\u4e00" <= ch <= "\u9fff":
                        return ch_prompt_classifier_template
            return en_prompt_classifier_template

        example_template = '\nUser: {{{{"human_input": "{inp}", "intent_list": {intent}}}}}\nAssistant: {label}\n'
        examples = ''.join([example_template.format(
            inp=input, intent=self._intent_list, label=label) for input, label in self._examples])
        prompt = choose_prompt().replace(
            '{user_prompt}', f' {self._prompt}').replace('{attention}', self._attention).replace(
            '{user_constrains}', f' {self._constrain}').replace('{user_examples}', f' {examples}')
        self._llm = self._llm.share(prompt=AlpacaPrompter(dict(system=prompt, user='${input}')
                                                          ).pre_hook(self.intent_promt_hook)).used_by(self._module_id)
        self._impl = pipeline(self._llm, self.post_process_result)

    def intent_promt_hook(
        self,
        input: Union[str, List, Dict[str, str], None] = None,
        history: List[Union[List[str], Dict[str, Any]]] = [],
        tools: Union[List[Dict[str, Any]], None] = None,
        label: Union[str, None] = None,
    ):
        input_json = {}
        if isinstance(input, str):
            input_json = {"human_input": input, "intent_list": self._intent_list}
        else:
            raise ValueError(f"Unexpected type for input: {type(input)}")

        history_info = chat_history_to_str(history)
        history = []
        input_text = json.dumps(input_json, ensure_ascii=False)
        return dict(history_info=history_info, input=input_text), history, tools, label

    def post_process_result(self, input):
        input = input.strip()
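        # fall back to the first intent in intent_list when the reply is not an exact match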
        return input if input in self._intent_list else self._intent_list[0]

    def forward(self, input: str, llm_chat_history: List[Dict[str, Any]] = None):
        if llm_chat_history is not None and self._llm._module_id not in globals["chat_history"]:
            globals["chat_history"][self._llm._module_id] = llm_chat_history
        return self._impl(input)

    def __enter__(self):
        assert not self._intent_list, 'Intent list is already set'
        self._sw = switch()
        self._sw.__enter__()
        return self

    @property
    def case(self):
        return switch.Case(self)

    @property
    def submodules(self):
        submodule = []
        if isinstance(self._impl, switch):
            self._impl.for_each(lambda x: isinstance(x, ModuleBase), lambda x: submodule.append(x))
        return super().submodules + submodule

    # used by switch.Case
    def _add_case(self, cond, func):
        assert isinstance(cond, str), 'intent must be string'
        self._intent_list.append(cond)
        self._sw.case[cond, func]

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._sw.__exit__(exc_type, exc_val, exc_tb)
        self._init()
        self._sw._set_conversion(self._impl)
        self._impl = self._sw