# Source code for langcheck.augment.en._jailbreak_template

from __future__ import annotations

from .._common._jailbreak_template import jailbreak_template_common

# Names of the English jailbreak templates that `jailbreak_template` accepts.
# Each entry is annotated with the Jinja2 template file it refers to and a
# one-line description of the prompt.
AVAILABLE_JAILBREAK_TEMPLATES = [
    # langcheck/metrics/prompts/en/jailbreak_templates/basic.j2
    # Basic "Ignore the instruction" prompt
    "basic",
    # langcheck/metrics/prompts/en/jailbreak_templates/chatgpt_dan.j2
    # Prompt that asks ChatGPT to get into "Do Anything Now" mode
    "chatgpt_dan",
    # langcheck/metrics/prompts/en/jailbreak_templates/chatgpt_good_vs_evil.j2
    # Prompt that asks ChatGPT to generate both good and evil outputs
    "chatgpt_good_vs_evil",
    # langcheck/metrics/prompts/en/jailbreak_templates/john.j2
    # Prompt that asks the LLM to act as a virtual assistant "John"
    "john",
    # langcheck/metrics/prompts/en/jailbreak_templates/universal_adversarial_suffix.j2
    # Prompt with the suffix reported by https://arxiv.org/abs/2307.15043
    "universal_adversarial_suffix",
]


def jailbreak_template(
    instances: list[str] | str,
    templates: list[str] | None = None,
    *,
    num_perturbations: int = 1,
    randomize_order: bool = True,
    seed: int | None = None,
) -> list[str]:
    """Applies jailbreak templates to each string in instances.

    This is the English entry point: it forwards all arguments to
    `jailbreak_template_common` together with the English template list
    (`AVAILABLE_JAILBREAK_TEMPLATES`) and the language code "en".

    Args:
        instances: A single string or a list of strings to be augmented.
        templates: A list of templates to apply. If None, some templates are
            randomly selected and used.
            Available templates are:

            - basic
            - chatgpt_dan
            - chatgpt_good_vs_evil
            - john
            - universal_adversarial_suffix
        num_perturbations: The number of perturbed instances to generate for
            each string in instances. Should be equal to or less than the
            number of templates.
        randomize_order: If True, the order of the templates is randomized.
            When turned off, num_perturbations needs to be equal to the
            number of templates.
        seed: The seed for the random number generator. You can fix the seed
            to deterministically select the same templates.

    Returns:
        A list of perturbed instances.
    """
    return jailbreak_template_common(
        instances,
        templates,
        AVAILABLE_JAILBREAK_TEMPLATES,
        "en",
        num_perturbations=num_perturbations,
        randomize_order=randomize_order,
        seed=seed,
    )