第2章 Anthropic Computer Use API 详解 — Computer Use 多模态 Agent 教程

支持的模型与版本

Computer Use 的模型要求

Computer Use 功能不是所有 Claude 模型都支持，需要使用特定版本：

claude-3-5-sonnet-20241022

Computer Use 的首发版本，2024 年 10 月发布。这是目前最稳定、官方文档最完整的版本。推荐用于生产环境。

claude-3-7-sonnet-20250219

2025 年发布的新版本，Computer Use 能力进一步增强，对复杂 UI 的识别更准确。推荐用于新项目开发。

claude-opus-4-20250514

Claude 4 系列旗舰版，Computer Use 能力最强，特别适合需要复杂推理的多步骤任务。成本较高。

三大工具定义详解

computer_20241022：核心交互工具

import anthropic

client = anthropic.Anthropic()

# Definition of the `computer` tool (the 2024-10-22 tool version)
computer_tool = {
    "type": "computer_20241022",
    "name": "computer",
    "display_width_px": 1920,  # screen width in pixels
    "display_height_px": 1080, # screen height in pixels
    "display_number": 1,        # display number (for multi-display setups); NOTE(review): presumably the X11 display index — confirm
}

# computer 工具支持的 action 类型
"""
screenshot         - 截取当前屏幕，无参数
cursor_position    - 获取当前鼠标位置，无参数
mouse_move         - 移动鼠标，需要 coordinate: [x, y]
left_click         - 左键单击，需要 coordinate: [x, y]
right_click        - 右键单击，需要 coordinate: [x, y]
middle_click       - 中键单击，需要 coordinate: [x, y]
double_click       - 双击，需要 coordinate: [x, y]
left_click_drag    - 左键拖拽，需要 start_coordinate 和 coordinate
key                - 按键（支持组合键），需要 text（如 "ctrl+c"）
type               - 输入文本，需要 text
scroll             - 滚动，需要 coordinate 和 scroll_direction/scroll_amount（computer_20250124 版本起支持）
"""

text_editor_20241022：文件编辑工具

# Definition of the text editor tool; it takes no configuration beyond
# its versioned type and its name.
text_editor_tool = {
    "type": "text_editor_20241022",
    "name": "str_replace_editor"
}

# 支持的命令
"""
view        - 查看文件内容，参数: path, [view_range]
create      - 创建新文件，参数: path, file_text
str_replace - 替换文本，参数: path, old_str, new_str
insert      - 在指定行后插入，参数: path, insert_line, new_str
undo_edit   - 撤销上次编辑，参数: path
"""

# 示例：Claude 使用 text_editor 修改配置文件
"""
{
    "type": "tool_use",
    "name": "str_replace_editor",
    "input": {
        "command": "str_replace",
        "path": "/etc/nginx/nginx.conf",
        "old_str": "listen 80;",
        "new_str": "listen 443 ssl;"
    }
}
"""

bash_20241022：Shell 工具

# Definition of the bash tool; it takes no configuration beyond its
# versioned type and its name.
bash_tool = {
    "type": "bash_20241022",
    "name": "bash"
}

# 示例：Claude 使用 bash 执行命令
"""
{
    "type": "tool_use",
    "name": "bash",
    "input": {
        "command": "ls -la /home/user/documents | head -20",
        "restart": false  // 是否重启 shell 会话
    }
}
"""

完整 API 调用示例

第一次调用：获取截图

import anthropic
import base64
from screenshot_tool import take_screenshot  # 第3章实现

client = anthropic.Anthropic()

# Declare every tool the agent is allowed to use
tools = [
    {"type": "computer_20241022", "name": "computer",
     "display_width_px": 1920, "display_height_px": 1080},
    {"type": "text_editor_20241022", "name": "str_replace_editor"},
    {"type": "bash_20241022", "name": "bash"},
]

# Initial message: only the user's task, no screenshot yet
messages = [{
    "role": "user",
    "content": "请打开终端并创建一个名为 hello.py 的文件，内容是 print('Hello, World!')"
}]

response = client.beta.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=4096,
    tools=tools,
    messages=messages,
    betas=["computer-use-2024-10-22"],  # the beta flag must be included
    system="""你是一个能控制计算机的 AI Agent。
你的操作将在真实的计算机上执行，请谨慎操作。
每次操作前先截图确认当前状态。"""
)

解析 tool_use 响应

import json

def parse_response(response):
    """Split a Claude response into its tool calls and its text fragments.

    Args:
        response: Object whose ``content`` attribute is an iterable of
            blocks.  Blocks with ``type == "text"`` contribute their
            ``text``; blocks with ``type == "tool_use"`` contribute their
            ``id``, ``name`` and ``input``.  Any other block type is ignored.

    Returns:
        A ``(tool_calls, text_content)`` tuple: a list of
        ``{"id", "name", "input"}`` dicts and a list of strings, each in
        the order the blocks appeared.
    """
    blocks = list(response.content)

    text_content = [blk.text for blk in blocks if blk.type == "text"]
    tool_calls = [
        {"id": blk.id, "name": blk.name, "input": blk.input}
        for blk in blocks
        if blk.type == "tool_use"
    ]

    return tool_calls, text_content

# 示例响应结构
"""
response.content = [
    TextBlock(type="text", text="我需要先截图查看当前屏幕状态"),
    ToolUseBlock(
        type="tool_use",
        id="toolu_01XxXxXxXxXxXxXxXxXxXxXx",
        name="computer",
        input={
            "action": "screenshot"
        }
    )
]
response.stop_reason = "tool_use"  # 表示 Claude 正在等待工具结果
"""

工具结果返回（tool_result）

构建 tool_result 消息

import base64


def execute_tool(tool_call: dict) -> dict:
    """Execute one tool call requested by Claude and return a content block.

    Args:
        tool_call: Mapping with the keys ``"name"`` (tool name) and
            ``"input"`` (the arguments the model supplied for that tool).

    Returns:
        A single content block, either ``{"type": "image", ...}`` for a
        screenshot or ``{"type": "text", ...}`` otherwise, ready to be
        placed in the ``content`` list of a ``tool_result``.  A block is
        always returned: unsupported tools/actions and incomplete bash
        calls produce a text block describing the problem, so the caller
        never forwards ``None`` to the API.
    """
    tool_name = tool_call["name"]
    tool_input = tool_call["input"]

    if tool_name == "computer":
        action = tool_input.get("action")

        if action == "screenshot":
            # Capture the screen and base64-encode it for the image block.
            screenshot_bytes = take_screenshot()
            screenshot_b64 = base64.standard_b64encode(screenshot_bytes).decode()
            return {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": screenshot_b64,
                },
            }
        if action == "left_click":
            x, y = tool_input["coordinate"]
            click_mouse(x, y)
            return {"type": "text", "text": f"Clicked at ({x}, {y})"}
        if action == "type":
            text = tool_input["text"]
            type_text(text)
            return {"type": "text", "text": f"Typed: {text}"}
        # Handlers for the remaining actions belong above this line.  Until
        # they exist, tell the model explicitly instead of returning None.
        return {"type": "text", "text": f"Unsupported computer action: {action}"}

    if tool_name == "bash":
        command = tool_input.get("command")
        if command is None:
            # The model may send a restart-only call that carries no command.
            if tool_input.get("restart"):
                return {
                    "type": "text",
                    "text": "Restart requested; no command was executed.",
                }
            return {"type": "text", "text": "Error: bash call is missing 'command'."}
        result = run_bash(command)
        # Either stream may be None or empty; an empty text block would be
        # rejected by the API, so substitute a placeholder.
        output = (result.stdout or "") + (result.stderr or "")
        return {"type": "text", "text": output or "(no output)"}

    return {"type": "text", "text": f"Unsupported tool: {tool_name}"}


def build_tool_result_message(tool_calls: list, assistant_content: list) -> dict:
    """Run every tool call and wrap the outcomes in one ``user`` message.

    Args:
        tool_calls: Tool-call dicts; each is passed to ``execute_tool`` and
            must carry an ``"id"`` key.
        assistant_content: Accepted for interface compatibility; this
            function does not read it.

    Returns:
        ``{"role": "user", "content": [...]}`` holding one ``tool_result``
        block per tool call, in the same order as ``tool_calls``.
    """

    def _to_result_block(call: dict) -> dict:
        # Run the tool first, then look up the id, matching the original order.
        outcome = execute_tool(call)
        return {
            "type": "tool_result",
            "tool_use_id": call["id"],
            "content": [outcome],  # content is a list of blocks
        }

    return {
        "role": "user",
        "content": [_to_result_block(call) for call in tool_calls],
    }

完整执行循环

Agent 主循环实现

async def run_computer_use_agent(task: str, max_steps: int = 50) -> str:
    """Drive the Computer Use loop until Claude ends its turn.

    Each step sends the running message history to the API, appends the
    assistant reply, executes any requested tools via ``execute_tool`` and
    appends their results as a ``user`` message.

    Args:
        task: The user's task description; becomes the first user message.
        max_steps: Upper bound on API round-trips, to prevent endless loops.

    Returns:
        When ``stop_reason`` is ``"end_turn"``: all text blocks of that final
        reply joined with newlines, or ``"任务已完成"`` if it has none.
        Otherwise (step budget exhausted, or a reply that neither ended the
        turn nor requested a tool) a message reporting the steps executed.
    """
    import asyncio
    import functools

    messages = [{"role": "user", "content": task}]
    steps = 0
    loop = asyncio.get_running_loop()

    while steps < max_steps:
        steps += 1

        # The SDK client used here is synchronous; run the HTTP call in the
        # default executor so this coroutine does not block the event loop.
        request = functools.partial(
            client.beta.messages.create,
            model="claude-3-5-sonnet-20241022",
            max_tokens=4096,
            tools=tools,
            messages=messages,
            betas=["computer-use-2024-10-22"],
        )
        response = await loop.run_in_executor(None, request)

        # Record Claude's reply in the history before acting on it.
        messages.append({
            "role": "assistant",
            "content": response.content
        })

        # end_turn means Claude considers the task finished.
        if response.stop_reason == "end_turn":
            # Return the whole final report, not just its first text block.
            texts = [block.text for block in response.content if block.type == "text"]
            if texts:
                return "\n".join(texts)
            return "任务已完成"

        tool_calls = [b for b in response.content if b.type == "tool_use"]
        if not tool_calls:
            # Neither finished nor asking for a tool (e.g. output truncated).
            break

        tool_results = []
        for tool_call in tool_calls:
            result = execute_tool({
                "name": tool_call.name,
                "input": tool_call.input
            })
            if result is None:
                # A missing result would become an invalid content block;
                # report it to the model as text instead.
                result = {
                    "type": "text",
                    "text": f"Tool '{tool_call.name}' returned no result "
                            "(unsupported tool or action).",
                }
            tool_results.append({
                "type": "tool_result",
                "tool_use_id": tool_call.id,
                "content": [result]
            })

        # Feed the tool results back as the next user message.
        messages.append({"role": "user", "content": tool_results})

    return f"已执行 {steps} 步，任务可能未完成"

Token 消耗分析

Computer Use 的 Token 特点

Computer Use 任务的 Token 消耗比普通对话高得多，主要原因是截图：

截图 Token 成本

一张 1920×1080 的截图，在发送给 Claude 前会被缩放处理，但仍然消耗约 1000-1200 tokens。如果每步都截图，执行 50 步的任务可能消耗 5-6 万 tokens 仅在截图上。

消息历史累积

Claude 需要看到完整的对话历史（包括所有截图）来保持上下文，因此每次请求的输入 token 会随步骤数线性增长，整个任务的累计消耗增长得更快。执行 50 步任务可能总消耗 20-30 万 tokens 甚至更多。

成本估算

以 claude-3-5-sonnet 的定价（$3/百万 input tokens，$15/百万 output tokens），一个 50 步的中等复杂任务约花费 $0.5-2 美元。

成本优化提示

可以通过以下方式降低 token 成本：缩小截图分辨率（960×540 通常足够）、只在必要时截图（不是每步都截）、压缩图像质量（JPEG 而非 PNG）、定期清理消息历史（只保留最近 N 轮）。这些优化在第3章和第10章会详细介绍。

API 请求结构完整解析

beta 请求头的必要性

Computer Use 是 Anthropic 的 beta 功能，必须在请求中显式声明。不同工具版本对应不同的 beta 标识符：

computer-use-2024-10-22

对应工具类型 computer_20241022、text_editor_20241022、bash_20241022。这是 2024 年 10 月首发版本，也是目前（2025年）最稳定的版本。Python SDK 中通过 betas=["computer-use-2024-10-22"] 传入。

computer-use-2025-01-24

2025 年 1 月发布的更新版 beta，对应新版工具类型后缀（_20250124）。引入了改进的坐标精度和更好的滚动操作支持。新项目推荐使用此版本。注意：claude-3-7-sonnet 及 Claude 4 系列模型应搭配此 beta 标识符和对应的新版工具类型（如 computer_20250124）；旧版工具类型不保证与新模型兼容。

HTTP 请求头（直接 API 调用）

如果不使用 Python SDK，需要在 HTTP 请求头中添加：anthropic-beta: computer-use-2024-10-22。可以同时指定多个 beta 功能，用逗号分隔。

import anthropic

client = anthropic.Anthropic()

# Complete request example using the Python SDK
response = client.beta.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=4096,
    # `betas` declares which beta features this request uses
    betas=["computer-use-2024-10-22"],
    # `tools` carries the built-in Computer Use tool definitions
    tools=[
        {"type": "computer_20241022", "name": "computer",
         "display_width_px": 1280, "display_height_px": 800},
    ],
    # `system` gives the agent its behavioral guidance
    system="""你是一个计算机操作助手。
每次操作前先截图了解当前屏幕状态。
操作后再截图确认结果。
遇到不确定的情况，停止并询问用户。""",
    # `messages` holds the conversation history (on later turns this
    # includes earlier screenshots and tool results)
    messages=[{
        "role": "user",
        "content": "请打开 Chrome 浏览器并导航到 google.com"
    }]
)

理解消息格式的层次结构

Computer Use 的对话历史比普通 API 调用复杂，包含文本块、图像块、工具调用块和工具结果块的混合。理解这个结构对于构建可靠的 Agent 至关重要：

"""
完整的消息历史结构示意（第3轮对话）：

messages = [
  # 第1轮：用户发起任务
  {
    "role": "user",
    "content": "打开 Chrome 并导航到 google.com"
  },

  # 第1轮：Claude 回复（包含文本 + 工具调用）
  {
    "role": "assistant",
    "content": [
      {
        "type": "text",
        "text": "好的，我先截图了解当前屏幕状态。"
      },
      {
        "type": "tool_use",
        "id": "toolu_01AAAA",
        "name": "computer",
        "input": {"action": "screenshot"}
      }
    ]
  },

  # 第2轮：工具结果（截图）作为 user 消息返回
  {
    "role": "user",
    "content": [
      {
        "type": "tool_result",
        "tool_use_id": "toolu_01AAAA",
        "content": [
          {
            "type": "image",
            "source": {
              "type": "base64",
              "media_type": "image/jpeg",
              "data": "/9j/4AAQSkZJRg..."  # 截图的 base64 数据
            }
          }
        ]
      }
    ]
  },

  # 第3轮：Claude 分析截图，执行下一步操作
  {
    "role": "assistant",
    "content": [
      {
        "type": "text",
        "text": "我看到桌面，需要找到 Chrome 浏览器。我看到 Dock 中有 Chrome 图标。"
      },
      {
        "type": "tool_use",
        "id": "toolu_01BBBB",
        "name": "computer",
        "input": {
          "action": "double_click",
          "coordinate": [940, 1050]  # Chrome 图标位置
        }
      }
    ]
  }
]
"""

stop_reason 的含义

Claude 的响应会包含 stop_reason，指示为什么停止生成。这是判断 Agent 下一步操作的关键信号：

stop_reason = "tool_use"

Claude 正在等待工具执行结果。你需要执行响应中的工具调用，然后将结果作为 tool_result 消息返回，再次调用 API。这是执行循环中最常见的状态。

stop_reason = "end_turn"

Claude 认为任务已完成，不需要更多工具调用。此时提取最后的文本内容作为最终结果，结束循环。

stop_reason = "max_tokens"

达到 max_tokens 限制，Claude 被强制中断。应该增大 max_tokens 或检查是否有不必要的冗余内容占用了 token 空间。

stop_reason = "stop_sequence"

遇到了预设的停止序列。在 Computer Use 场景中较少使用，但可以用来设计可中断的 Agent。

常见 API 错误与处理

import anthropic
from anthropic import APIError, APIConnectionError, RateLimitError
import time


def call_api_with_retry(client, **kwargs):
    """Call ``client.beta.messages.create`` with a bounded retry policy.

    Rate-limit errors, connection errors and HTTP 529 (overloaded) are
    retried up to three attempts in total; every other API error,
    including HTTP 400, is re-raised immediately.

    Args:
        client: Object exposing ``beta.messages.create``.
        **kwargs: Forwarded unchanged to ``beta.messages.create``.

    Returns:
        Whatever ``beta.messages.create`` returns on the first success.

    Raises:
        APIError: For non-retryable API errors (re-raised as-is).
        RuntimeError: When every attempt failed with a retryable error; the
            last such error is attached as ``__cause__``.
    """
    max_retries = 3
    last_error = None
    for attempt in range(max_retries):
        try:
            return client.beta.messages.create(**kwargs)

        except RateLimitError as e:
            # Rate limited: exponential backoff before the next attempt.
            last_error = e
            wait = 2 ** attempt
            print(f"速率限制，{wait}秒后重试...")

        except APIConnectionError as e:
            # Connection failure: short pause, then retry.
            last_error = e
            wait = 1
            print(f"连接失败（第 {attempt+1} 次）")

        except APIError as e:
            # Not every APIError subclass carries a status code, so read it
            # defensively rather than risking an AttributeError here.
            status = getattr(e, "status_code", None)
            if status == 400:
                # Malformed request (usually message structure): never retry.
                print(f"请求错误（400）: {e.message}")
                raise
            if status != 529:
                raise
            # Service overloaded: back off for longer.
            last_error = e
            wait = 10 * (attempt + 1)

        # Sleeping after the final attempt would only delay the failure.
        if attempt < max_retries - 1:
            time.sleep(wait)

    raise RuntimeError(f"API 调用失败，已重试 {max_retries} 次") from last_error

常见错误：tool_result 格式错误

在 Computer Use 中，最常见的 API 错误（400）是 tool_result 的 content 格式不正确。截图结果的 content 必须是一个列表（[{"type": "image", ...}]），而不是直接的图像对象。文本结果的 content 可以是字符串或列表。这个差异很容易导致调试困难。

章节小结

本章深入讲解了 Computer Use API 的完整技术细节。核心要点：

三大工具（computer / text_editor / bash）有版本对应关系，必须搭配正确的 beta 标识符
消息历史是文本块、工具调用块、工具结果块的混合结构，理解此结构是构建 Agent 的基础
stop_reason = "tool_use" 表示需要执行工具并返回结果，end_turn 表示任务完成
截图是最大的 token 消耗源，一张 1920×1080 的截图约 1000-1200 token（分辨率不同会有所浮动）；合理控制截图频率可显著降低成本
API 错误应区分可重试（429、529）和不可重试（400）两类，设计对应的重试逻辑

← 上一章：多模态 AI 与 Computer Use 概述 | 下一章：截图工具与视觉理解 →