使用多步骤提示进行单元测试编写

复杂的任务，比如编写单元测试，可以从多步骤提示中受益。与单个提示不同，多步骤提示从GPT生成文本，然后将输出文本反馈给后续提示。如果您希望GPT在回答问题之前进行推理，或者在执行计划之前进行头脑风暴，这可以帮助您。

我们使用一个3步提示符，使用以下步骤在Python中编写单元测试:

解释:
给定一个Python函数，我们要求GPT解释该函数在做什么以及为什么。
计划:
我们要求GPT为功能计划一组单元测试。如果计划太短，我们要求GPT详细阐述单元测试的更多想法。
执行:
最后，我们指示GPT编写涵盖计划用例的单元测试。

代码示例演示了对链式多步骤提示符的一些修饰:

条件分支(例如，只有当第一个计划太短时才要求详细说明)
针对不同的步骤选择不同的模型
如果输出不令人满意(例如，如果输出代码不能被Python的ast模块解析)，则重新运行函数的检查
流式输出，以便您可以在输出完全生成之前开始读取输出(对于长、多步骤输出很方便)

# imports needed to run the code in this notebook
import ast  # used for detecting whether generated Python code is valid
import openai  # used for calling the OpenAI API

color_prefix_by_role = {
    "system": "\033[0m",  # gray
    "user": "\033[0m",  # gray
    "assistant": "\033[92m",  # green
}


def print_messages(messages, color_prefix_by_role=color_prefix_by_role) -> None:
    """Prints messages sent to or from GPT."""
    for message in messages:
        role = message["role"]
        color_prefix = color_prefix_by_role[role]
        content = message["content"]
        print(f"{color_prefix}\n[{role}]\n{content}")


def print_message_delta(delta, color_prefix_by_role=color_prefix_by_role) -> None:
    """Prints a chunk of messages streamed back from GPT."""
    if "role" in delta:
        role = delta["role"]
        color_prefix = color_prefix_by_role[role]
        print(f"{color_prefix}\n[{role}]\n", end="")
    elif "content" in delta:
        content = delta["content"]
        print(content, end="")
    else:
        pass


# example of a function that uses a multi-step prompt to write unit tests
def unit_tests_from_function(
    function_to_test: str,  # Python function to test, as a string
    unit_test_package: str = "pytest",  # unit testing package; use the name as it appears in the import statement
    approx_min_cases_to_cover: int = 7,  # minimum number of test case categories to cover (approximate)
    print_text: bool = False,  # optionally prints text; helpful for understanding the function & debugging
    explain_model: str = "gpt-3.5-turbo",  # model used to generate text plans in step 1
    plan_model: str = "gpt-3.5-turbo",  # model used to generate text plans in steps 2 and 2b
    execute_model: str = "gpt-3.5-turbo",  # model used to generate code in step 3
    temperature: float = 0.4,  # temperature = 0 can sometimes get stuck in repetitive loops, so we use 0.4
    reruns_if_fail: int = 1,  # if the output code cannot be parsed, this will re-run the function up to N times
) -> str:
    """Returns a unit test for a given Python function, using a 3-step GPT prompt."""

    # Step 1: Generate an explanation of the function

    # create a markdown-formatted message that asks GPT to explain the function, formatted as a bullet list
    explain_system_message = {
        "role": "system",
        "content": "You are a world-class Python developer with an eagle eye for unintended bugs and edge cases. You carefully explain code with great detail and accuracy. You organize your explanations in markdown-formatted, bulleted lists.",
    }
    explain_user_message = {
        "role": "user",
        "content": f"""Please explain the following Python function. Review what each element of the function is doing precisely and what the author's intentions may have been. Organize your explanation as a markdown-formatted, bulleted list.

```python
{function_to_test}
```""",
    }
    explain_messages = [explain_system_message, explain_user_message]
    if print_text:
        print_messages(explain_messages)

    explanation_response = openai.ChatCompletion.create(
        model=explain_model,
        messages=explain_messages,
        temperature=temperature,
        stream=True,
    )
    explanation = ""
    for chunk in explanation_response:
        delta = chunk["choices"][0]["delta"]
        if print_text:
            print_message_delta(delta)
        if "content" in delta:
            explanation += delta["content"]
    explain_assistant_message = {"role": "assistant", "content": explanation}

    # Step 2: Generate a plan to write a unit test

    # Asks GPT to plan out cases the units tests should cover, formatted as a bullet list
    plan_user_message = {
        "role": "user",
        "content": f"""A good unit test suite should aim to:
- Test the function's behavior for a wide range of possible inputs
- Test edge cases that the author may not have foreseen
- Take advantage of the features of `{unit_test_package}` to make the tests easy to write and maintain
- Be easy to read and understand, with clean code and descriptive names
- Be deterministic, so that the tests always pass or fail in the same way

To help unit test the function above, list diverse scenarios that the function should be able to handle (and under each scenario, include a few examples as sub-bullets).""",
    }
    plan_messages = [
        explain_system_message,
        explain_user_message,
        explain_assistant_message,
        plan_user_message,
    ]
    if print_text:
        print_messages([plan_user_message])
    plan_response = openai.ChatCompletion.create(
        model=plan_model,
        messages=plan_messages,
        temperature=temperature,
        stream=True,
    )
    plan = ""
    for chunk in plan_response:
        delta = chunk["choices"][0]["delta"]
        if print_text:
            print_message_delta(delta)
        if "content" in delta:
            plan += delta["content"]
    plan_assistant_message = {"role": "assistant", "content": plan}

    # Step 2b: If the plan is short, ask GPT to elaborate further
    # this counts top-level bullets (e.g., categories), but not sub-bullets (e.g., test cases)
    num_bullets = max(plan.count("\n-"), plan.count("\n*"))
    elaboration_needed = num_bullets < approx_min_cases_to_cover
    if elaboration_needed:
        elaboration_user_message = {
            "role": "user",
            "content": f"""In addition to those scenarios above, list a few rare or unexpected edge cases (and as before, under each edge case, include a few examples as sub-bullets).""",
        }
        elaboration_messages = [
            explain_system_message,
            explain_user_message,
            explain_assistant_message,
            plan_user_message,
            plan_assistant_message,
            elaboration_user_message,
        ]
        if print_text:
            print_messages([elaboration_user_message])
        elaboration_response = openai.ChatCompletion.create(
            model=plan_model,
            messages=elaboration_messages,
            temperature=temperature,
            stream=True,
        )
        elaboration = ""
        for chunk in elaboration_response:
            delta = chunk["choices"][0]["delta"]
            if print_text:
                print_message_delta(delta)
            if "content" in delta:
                elaboration += delta["content"]
        elaboration_assistant_message = {"role": "assistant", "content": elaboration}

    # Step 3: Generate the unit test

    # create a markdown-formatted prompt that asks GPT to complete a unit test
    package_comment = ""
    if unit_test_package == "pytest":
        package_comment = "# below, each test case is represented by a tuple passed to the @pytest.mark.parametrize decorator"
    execute_system_message = {
        "role": "system",
        "content": "You are a world-class Python developer with an eagle eye for unintended bugs and edge cases. You write careful, accurate unit tests. When asked to reply only with code, you write all of your code in a single block.",
    }
    execute_user_message = {
        "role": "user",
        "content": f"""Using Python and the `{unit_test_package}` package, write a suite of unit tests for the function, following the cases above. Include helpful comments to explain each line. Reply only with code, formatted as follows:

```python
# imports
import {unit_test_package}  # used for our unit tests
{{insert other imports as needed}}

# function to test
{function_to_test}

# unit tests
{package_comment}
{{insert unit test code here}}
```""",
    }
    execute_messages = [
        execute_system_message,
        explain_user_message,
        explain_assistant_message,
        plan_user_message,
        plan_assistant_message,
    ]
    if elaboration_needed:
        execute_messages += [elaboration_user_message, elaboration_assistant_message]
    execute_messages += [execute_user_message]
    if print_text:
        print_messages([execute_system_message, execute_user_message])

    execute_response = openai.ChatCompletion.create(
        model=execute_model,
        messages=execute_messages,
        temperature=temperature,
        stream=True,
    )
    execution = ""
    for chunk in execute_response:
        delta = chunk["choices"][0]["delta"]
        if print_text:
            print_message_delta(delta)
        if "content" in delta:
            execution += delta["content"]

    # check the output for errors
    code = execution.split("```python")[1].split("```")[0].strip()
    try:
        ast.parse(code)
    except SyntaxError as e:
        print(f"Syntax error in generated code: {e}")
        if reruns_if_fail > 0:
            print("Rerunning...")
            return unit_tests_from_function(
                function_to_test=function_to_test,
                unit_test_package=unit_test_package,
                approx_min_cases_to_cover=approx_min_cases_to_cover,
                print_text=print_text,
                explain_model=explain_model,
                plan_model=plan_model,
                execute_model=execute_model,
                temperature=temperature,
                reruns_if_fail=reruns_if_fail
                - 1,  # decrement rerun counter when calling again
            )

    # return the unit test as a string
    return code

example_function = """def pig_latin(text):
    def translate(word):
        vowels = 'aeiou'
        if word[0] in vowels:
            return word + 'way'
        else:
            consonants = ''
            for letter in word:
                if letter not in vowels:
                    consonants += letter
                else:
                    break
            return word[len(consonants):] + consonants + 'ay'

    words = text.lower().split()
    translated_words = [translate(word) for word in words]
    return ' '.join(translated_words)
"""

unit_tests = unit_tests_from_function(
    example_function,
    approx_min_cases_to_cover=10,
    print_text=True
)

[system]
You are a world-class Python developer with an eagle eye for unintended bugs and edge cases. You carefully explain code with great detail and accuracy. You organize your explanations in markdown-formatted, bulleted lists.

[user]
Please explain the following Python function. Review what each element of the function is doing precisely and what the author's intentions may have been. Organize your explanation as a markdown-formatted, bulleted list.

def pig_latin(text):
    def translate(word):
        vowels = 'aeiou'
        if word[0] in vowels:
            return word + 'way'
        else:
            consonants = ''
            for letter in word:
                if letter not in vowels:
                    consonants += letter
                else:
                    break
            return word[len(consonants):] + consonants + 'ay'

    words = text.lower().split()
    translated_words = [translate(word) for word in words]
    return ' '.join(translated_words)

[assistant]
The `pig_latin` function takes a string of text and returns the text translated into pig latin. Here's how it works:

* The function defines a nested function called `translate` that takes a single word as input and returns the word translated into pig latin.
* The `translate` function first defines a string of vowels.
* If the first letter of the input word is a vowel, the function adds "way" to the end of the word and returns the result.
* If the first letter of the input word is a consonant, the function loops through the word's letters until it finds a vowel.
* The function then takes the consonants at the beginning of the word and moves them to the end of the word, adding "ay" to the end of the word.
* The `pig_latin` function lowercases the input text and splits it into a list of words.
* The function then applies the `translate` function to each word in the list using a list comprehension.
* Finally, the function joins the translated words back together into a single string with spaces between each word and returns the result.
[user]
A good unit test suite should aim to:
- Test the function's behavior for a wide range of possible inputs
- Test edge cases that the author may not have foreseen
- Take advantage of the features of `pytest` to make the tests easy to write and maintain
- Be easy to read and understand, with clean code and descriptive names
- Be deterministic, so that the tests always pass or fail in the same way

To help unit test the function above, list diverse scenarios that the function should be able to handle (and under each scenario, include a few examples as sub-bullets).

[assistant]
Here are some scenarios that the `pig_latin` function should be able to handle, along with examples:

* Words that start with a vowel:
  * "apple" -> "appleway"
  * "elephant" -> "elephantway"
* Words that start with a single consonant:
  * "pig" -> "igpay"
  * "latin" -> "atinlay"
* Words that start with multiple consonants:
  * "string" -> "ingstray"
  * "glove" -> "oveglay"
* Words that contain numbers or special characters:
  * "hello!" -> "ellohay!"
  * "world123" -> "orldway123"
* Sentences with multiple words:
  * "hello world" -> "ellohay orldway"
  * "the quick brown fox" -> "hetay ickquay ownbray oxfay"
* Sentences with punctuation:
  * "Hello, world!" -> "Ellohay, orldway!"
  * "The quick brown fox..." -> "Hetay ickquay ownbray oxfay..." 
* Empty strings:
  * "" -> ""
[user]
In addition to those scenarios above, list a few rare or unexpected edge cases (and as before, under each edge case, include a few examples as sub-bullets).

[assistant]
Here are some rare or unexpected edge cases that the `pig_latin` function should be able to handle, along with examples:

* Words that consist entirely of consonants:
  * "xyz" -> "xyzay"
  * "rhythm" -> "ythmrhay"
* Words that consist entirely of vowels:
  * "aeiou" -> "aeiouway"
  * "ouiea" -> "ouieaway"
* Words that contain mixed case:
  * "PyThOn" -> "ythonpay"
  * "eLePhAnT" -> "elephantway"
* Strings that contain only whitespace:
  * "   " -> ""
* Non-string inputs:
  * None -> TypeError
  * 42 -> AttributeError
[system]
You are a world-class Python developer with an eagle eye for unintended bugs and edge cases. You write careful, accurate unit tests. When asked to reply only with code, you write all of your code in a single block.

[user]
Using Python and the `pytest` package, write a suite of unit tests for the function, following the cases above. Include helpful comments to explain each line. Reply only with code, formatted as follows:

# imports
import pytest  # used for our unit tests
{insert other imports as needed}

# function to test
def pig_latin(text):
    def translate(word):
        vowels = 'aeiou'
        if word[0] in vowels:
            return word + 'way'
        else:
            consonants = ''
            for letter in word:
                if letter not in vowels:
                    consonants += letter
                else:
                    break
            return word[len(consonants):] + consonants + 'ay'

    words = text.lower().split()
    translated_words = [translate(word) for word in words]
    return ' '.join(translated_words)


# unit tests
# below, each test case is represented by a tuple passed to the @pytest.mark.parametrize decorator
{insert unit test code here}

[assistant]

# imports
import pytest

# function to test
def pig_latin(text):
    def translate(word):
        vowels = 'aeiou'
        if word[0] in vowels:
            return word + 'way'
        else:
            consonants = ''
            for letter in word:
                if letter not in vowels:
                    consonants += letter
                else:
                    break
            return word[len(consonants):] + consonants + 'ay'

    words = text.lower().split()
    translated_words = [translate(word) for word in words]
    return ' '.join(translated_words)


# unit tests
# below, each test case is represented by a tuple passed to the @pytest.mark.parametrize decorator

# Tests for normal cases
@pytest.mark.parametrize('text, expected', [
    ('apple', 'appleway'),
    ('elephant', 'elephantway'),
    ('pig', 'igpay'),
    ('latin', 'atinlay'),
    ('string', 'ingstray'),
    ('glove', 'oveglay'),
    ('hello world', 'ellohay orldway'),
    ('the quick brown fox', 'hetay ickquay ownbray oxfay'),
    ('Hello, world!', 'Ellohay, orldway!'),
    ('The quick brown fox...', 'Hetay ickquay ownbray oxfay...'),
    ('', ''),
])

def test_pig_latin_normal_cases(text, expected):
    assert pig_latin(text) == expected


# Tests for edge cases
@pytest.mark.parametrize('text, expected', [
    ('xyz', 'xyzay'),
    ('rhythm', 'ythmrhay'),
    ('aeiou', 'aeiouway'),
    ('ouiea', 'ouieaway'),
    ('PyThOn', 'ythonpay'),
    ('eLePhAnT', 'elephantway'),
    ('   ', ''),
    (None, TypeError),
    (42, AttributeError)
])

def test_pig_latin_edge_cases(text, expected):
    if type(expected) == type:
        with pytest.raises(expected):
            pig_latin(text)
    else:
        assert pig_latin(text) == expected

# imports
import pytest

# function to test
def pig_latin(text):
    def translate(word):
        vowels = 'aeiou'
        if word[0] in vowels:
            return word + 'way'
        else:
            consonants = ''
            for letter in word:
                if letter not in vowels:
                    consonants += letter
                else:
                    break
            return word[len(consonants):] + consonants + 'ay'

    words = text.lower().split()
    translated_words = [translate(word) for word in words]
    return ' '.join(translated_words)


# unit tests
# below, each test case is represented by a tuple passed to the @pytest.mark.parametrize decorator

# Tests for normal cases
@pytest.mark.parametrize('text, expected', [
    ('apple', 'appleway'),
    ('elephant', 'elephantway'),
    ('pig', 'igpay'),
    ('latin', 'atinlay'),
    ('string', 'ingstray'),
    ('glove', 'oveglay'),
    ('hello world', 'ellohay orldway'),
    ('the quick brown fox', 'hetay ickquay ownbray oxfay'),
    ('Hello, world!', 'Ellohay, orldway!'),
    ('The quick brown fox...', 'Hetay ickquay ownbray oxfay...'),
    ('', ''),
])

def test_pig_latin_normal_cases(text, expected):
    assert pig_latin(text) == expected


# Tests for edge cases
@pytest.mark.parametrize('text, expected', [
    ('xyz', 'xyzay'),
    ('rhythm', 'ythmrhay'),
    ('aeiou', 'aeiouway'),
    ('ouiea', 'ouieaway'),
    ('PyThOn', 'ythonpay'),
    ('eLePhAnT', 'elephantway'),
    ('   ', ''),
    (None, TypeError),
    (42, AttributeError)
])

def test_pig_latin_edge_cases(text, expected):
    if type(expected) == type:
        with pytest.raises(expected):
            pig_latin(text)
    else:
        assert pig_latin(text) == expected

在使用代码之前一定要检查它，因为GPT会犯很多错误(特别是在像这样的基于字符的任务中)。为了获得最佳效果，请使用最强大的模型(截至2023年5月的GPT-4)。