From fb1d9a7c60b2f55566ac2e5feffd98d589c8b99e Mon Sep 17 00:00:00 2001 From: alikumale <149090479+alikumale@users.noreply.github.com> Date: Mon, 15 Dec 2025 17:36:40 +0800 Subject: [PATCH 1/8] Create V1 --- V1 | 1 + 1 file changed, 1 insertion(+) create mode 100644 V1 diff --git a/V1 b/V1 new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/V1 @@ -0,0 +1 @@ + From 547a78f769fdb9e910f66e32c9575a1b76b07c30 Mon Sep 17 00:00:00 2001 From: alikumale <149090479+alikumale@users.noreply.github.com> Date: Mon, 15 Dec 2025 17:43:23 +0800 Subject: [PATCH 2/8] Delete V1 --- V1 | 1 - 1 file changed, 1 deletion(-) delete mode 100644 V1 diff --git a/V1 b/V1 deleted file mode 100644 index 8b137891..00000000 --- a/V1 +++ /dev/null @@ -1 +0,0 @@ - From 208fd6382ac26621dd8629a767bf273c678158e3 Mon Sep 17 00:00:00 2001 From: alikumale <149090479+alikumale@users.noreply.github.com> Date: Mon, 15 Dec 2025 17:56:16 +0800 Subject: [PATCH 3/8] Create V1 --- V1 | 1 + 1 file changed, 1 insertion(+) create mode 100644 V1 diff --git a/V1 b/V1 new file mode 100644 index 00000000..c5012691 --- /dev/null +++ b/V1 @@ -0,0 +1 @@ +rebuild_v1/README.md From be06a5bc61aa5414e0f4caa7707cffbf8805ba4f Mon Sep 17 00:00:00 2001 From: alikumale <149090479+alikumale@users.noreply.github.com> Date: Mon, 15 Dec 2025 17:58:51 +0800 Subject: [PATCH 4/8] Create README.md --- rebuild_v1/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 rebuild_v1/README.md diff --git a/rebuild_v1/README.md b/rebuild_v1/README.md new file mode 100644 index 00000000..c1e3b52e --- /dev/null +++ b/rebuild_v1/README.md @@ -0,0 +1 @@ +HI From 025c310012762b1b2b1b5f46c5665ad4d60e5db4 Mon Sep 17 00:00:00 2001 From: alikumale <149090479+alikumale@users.noreply.github.com> Date: Mon, 15 Dec 2025 17:59:13 +0800 Subject: [PATCH 5/8] Delete V1 --- V1 | 1 - 1 file changed, 1 deletion(-) delete mode 100644 V1 diff --git a/V1 b/V1 deleted file mode 100644 index c5012691..00000000 --- a/V1 +++ /dev/null @@ -1 +0,0 @@ 
-rebuild_v1/README.md From 7ec7ef371faf03e6042da1eda72befaca60515bd Mon Sep 17 00:00:00 2001 From: alikumale <149090479+alikumale@users.noreply.github.com> Date: Mon, 15 Dec 2025 18:31:37 +0800 Subject: [PATCH 6/8] Add clean rebuild for self-operating computer v1 --- rebuild_v1/.env.example | 2 + rebuild_v1/.gitignore | 5 + rebuild_v1/README.md | 49 ++++++++- rebuild_v1/automation.py | 133 ++++++++++++++++++++++ rebuild_v1/config.py | 39 +++++++ rebuild_v1/engine_llm.py | 151 +++++++++++++++++++++++++ rebuild_v1/gui.py | 214 ++++++++++++++++++++++++++++++++++++ rebuild_v1/main.py | 67 +++++++++++ rebuild_v1/requirements.txt | 6 + 9 files changed, 665 insertions(+), 1 deletion(-) create mode 100644 rebuild_v1/.env.example create mode 100644 rebuild_v1/.gitignore create mode 100644 rebuild_v1/automation.py create mode 100644 rebuild_v1/config.py create mode 100644 rebuild_v1/engine_llm.py create mode 100644 rebuild_v1/gui.py create mode 100644 rebuild_v1/main.py create mode 100644 rebuild_v1/requirements.txt diff --git a/rebuild_v1/.env.example b/rebuild_v1/.env.example new file mode 100644 index 00000000..38354282 --- /dev/null +++ b/rebuild_v1/.env.example @@ -0,0 +1,2 @@ +# Optional: set your OpenRouter API key for Phase B +OPENROUTER_API_KEY= diff --git a/rebuild_v1/.gitignore b/rebuild_v1/.gitignore new file mode 100644 index 00000000..404642ea --- /dev/null +++ b/rebuild_v1/.gitignore @@ -0,0 +1,5 @@ +config.json +.env +.venv +__pycache__/ +*.pyc diff --git a/rebuild_v1/README.md b/rebuild_v1/README.md index c1e3b52e..fbc3c200 100644 --- a/rebuild_v1/README.md +++ b/rebuild_v1/README.md @@ -1 +1,48 @@ -HI +# Self-Operating Computer v1 (clean rebuild) + +A minimal Windows-friendly rewrite that runs a decide-and-act loop with a local Ollama model (no API keys required). Phase B wiring for OpenRouter is also in place. 
+ +## Features +- Tkinter GUI with Tasks, Settings, and Logs tabs +- Dry Run safety (on by default) and STOP hotkey/button +- Loop: screenshot → LLM JSON action → validate → (optionally) execute via `pyautogui` +- Config persisted to `rebuild_v1/config.json` + +## Requirements +- Python 3.11 on Windows +- [Ollama](https://ollama.com) running locally (default host `http://localhost:11434`) +- Optional: OpenRouter API key for Phase B + +Install Python packages: + +```bash +python -m venv .venv +.venv\\Scripts\\activate +pip install -r rebuild_v1/requirements.txt +``` + +## Running the app +```bash +python -m rebuild_v1.main +``` +If Tkinter fails to start in a headless environment, run on a local desktop session. + +## Using Ollama (Phase A) +1. Start Ollama: `ollama serve` +2. Pull the default model once: `ollama pull llama3.2:3b` +3. Ensure the host and model fields in **Settings** match your setup. + +The app checks connectivity to Ollama and shows a friendly message if the server or model is unavailable. + +## Safety controls +- **Dry Run**: enabled by default; shows actions without executing. +- **STOP hotkey**: `Ctrl+Alt+S` (configurable) registered via the `keyboard` library. +- **STOP button**: halts the current loop. +- **Max steps** and **delay** configurable in Settings. + +## OpenRouter (Phase B) +A provider dropdown exists; if "OpenRouter (API)" is chosen, set your API key, model, and base URL in Settings. Requests use the OpenAI-compatible SDK. + +## Notes +- Screenshots use Pillow's `ImageGrab`; ensure a visible desktop session. +- Avoid sharing `config.json`—it is gitignored and may contain sensitive keys. 
diff --git a/rebuild_v1/automation.py b/rebuild_v1/automation.py new file mode 100644 index 00000000..c9a5aaa3 --- /dev/null +++ b/rebuild_v1/automation.py @@ -0,0 +1,133 @@ +import threading +import time +from dataclasses import dataclass +from typing import Callable, Dict, Optional + +import keyboard +import pyautogui +from PIL import ImageGrab + +pyautogui.FAILSAFE = False + + +@dataclass +class ActionResult: + action: Dict[str, object] + executed: bool + error: Optional[str] = None + + +class AutomationEngine: + def __init__(self, dry_run: bool = True, stop_hotkey: str = "ctrl+alt+s"): + self.dry_run = dry_run + self.stop_hotkey = stop_hotkey + self._stop_flag = threading.Event() + self._hotkey_registered = False + + def register_stop_hotkey(self) -> None: + if self._hotkey_registered: + return + try: + keyboard.add_hotkey(self.stop_hotkey, self.stop) + self._hotkey_registered = True + except keyboard.KeyboardException: + # Keyboard may need elevated privileges; ignore if unavailable + pass + + def stop(self) -> None: + self._stop_flag.set() + + def reset_stop(self) -> None: + self._stop_flag.clear() + + def should_stop(self) -> bool: + return self._stop_flag.is_set() + + def capture_screenshot(self, save_path: Optional[str] = None): + image = ImageGrab.grab() + if save_path: + image.save(save_path) + return image + + def execute_action(self, action: Dict[str, object]) -> ActionResult: + if self.dry_run: + return ActionResult(action=action, executed=False, error=None) + + try: + action_type = action.get("type") + if action_type == "click": + pyautogui.click(x=int(action["x"]), y=int(action["y"])) + elif action_type == "type": + pyautogui.typewrite(str(action["text"])) + elif action_type == "hotkey": + keys = [str(k) for k in action.get("keys", [])] + pyautogui.hotkey(*keys) + elif action_type == "wait": + time.sleep(float(action.get("seconds", 0))) + elif action_type == "done": + # No-op + pass + else: + return ActionResult(action=action, executed=False, 
error="Unknown action type") + return ActionResult(action=action, executed=True, error=None) + except Exception as exc: # pylint: disable=broad-except + return ActionResult(action=action, executed=False, error=str(exc)) + + +class ActionLooper: + def __init__( + self, + automation: AutomationEngine, + request_action: Callable[[str, str], str], + parse_action: Callable[[str], Optional[Dict[str, object]]], + max_steps: int, + delay_seconds: float, + log_callback: Callable[[str], None], + ): + self.automation = automation + self.request_action = request_action + self.parse_action = parse_action + self.max_steps = max_steps + self.delay_seconds = delay_seconds + self.log_callback = log_callback + + def run(self, objective: str) -> str: + self.automation.reset_stop() + self.automation.register_stop_hotkey() + last_note = "Screenshot captured" + for step in range(1, self.max_steps + 1): + if self.automation.should_stop(): + return "Stopped by user" + try: + self.automation.capture_screenshot() + except Exception as exc: # pylint: disable=broad-except + self.log_callback(f"Screenshot failed: {exc}") + last_note = "screenshot failed" + else: + last_note = "screenshot taken" + + retries = 0 + action_data = None + raw = "" + while retries < 3 and action_data is None: + raw = self.request_action(objective, last_note) + self.log_callback(f"Raw model response: {raw}") + action_data = self.parse_action(raw) + if action_data is None: + retries += 1 + self.log_callback("Model returned invalid JSON. 
Retrying...") + if action_data is None: + return "Failed to parse action after retries" + + result = self.automation.execute_action(action_data) + executed_text = "executed" if result.executed else "dry-run" + self.log_callback(f"Action step {step}: {action_data} ({executed_text})") + if result.error: + self.log_callback(f"Action error: {result.error}") + if action_data.get("type") == "done": + return str(action_data.get("reason", "Done")) + + if self.automation.should_stop(): + return "Stopped by user" + time.sleep(self.delay_seconds) + return "Reached max steps" diff --git a/rebuild_v1/config.py b/rebuild_v1/config.py new file mode 100644 index 00000000..5fc071a7 --- /dev/null +++ b/rebuild_v1/config.py @@ -0,0 +1,39 @@ +import json +from pathlib import Path +from typing import Any, Dict + +CONFIG_PATH = Path(__file__).parent / "config.json" + +DEFAULT_CONFIG: Dict[str, Any] = { + "provider": "Ollama (Local)", + "ollama_host": "http://localhost:11434", + "ollama_model": "llama3.2:3b", + "openrouter_api_key": "", + "openrouter_model": "openrouter/auto", + "openrouter_base_url": "https://openrouter.ai/api/v1", + "max_steps": 10, + "delay_seconds": 0.6, + "stop_hotkey": "ctrl+alt+s", + "dry_run": True, +} + + +def load_config() -> Dict[str, Any]: + if CONFIG_PATH.exists(): + try: + with CONFIG_PATH.open("r", encoding="utf-8") as f: + data = json.load(f) + merged = {**DEFAULT_CONFIG, **data} + return merged + except (json.JSONDecodeError, OSError): + return DEFAULT_CONFIG.copy() + return DEFAULT_CONFIG.copy() + + +def save_config(config: Dict[str, Any]) -> None: + try: + with CONFIG_PATH.open("w", encoding="utf-8") as f: + json.dump(config, f, indent=2) + except OSError: + # Prefer silent failure over crashing the UI when filesystem is read-only + pass diff --git a/rebuild_v1/engine_llm.py b/rebuild_v1/engine_llm.py new file mode 100644 index 00000000..46092b57 --- /dev/null +++ b/rebuild_v1/engine_llm.py @@ -0,0 +1,151 @@ +import json +from typing import Dict, List, 
Optional + +import requests +from openai import OpenAI + + +class LLMError(Exception): + """Raised when the language model call fails.""" + + +class LLMEngine: + def __init__(self, config: Dict[str, object]): + self.config = config + + def _ollama_chat(self, messages: List[Dict[str, str]]) -> str: + host = str(self.config.get("ollama_host") or "http://localhost:11434") + model = str(self.config.get("ollama_model") or "llama3.2:3b") + try: + health = requests.get(f"{host}/api/tags", timeout=3) + except requests.RequestException as exc: + raise LLMError( + "Could not reach Ollama. Please ensure it is running on this machine." + ) from exc + + if health.status_code != 200: + raise LLMError( + "Ollama responded unexpectedly. Please restart Ollama and try again." + ) + + payload = { + "model": model, + "messages": messages, + "stream": False, + } + try: + response = requests.post( + f"{host}/api/chat", json=payload, timeout=30, stream=False + ) + except requests.RequestException as exc: + raise LLMError( + "Failed to call Ollama. Is the service running on the configured host?" + ) from exc + + if response.status_code == 404: + raise LLMError( + "Model not found. Please install it with: ollama pull llama3.2:3b" + ) + if response.status_code >= 500: + raise LLMError("Ollama server error. Please try again after a moment.") + if response.status_code >= 400: + raise LLMError( + "Ollama rejected the request. 
If the model is missing, run: ollama pull llama3.2:3b" + ) + + data = response.json() + message = data.get("message", {}) + content = message.get("content") + if not content: + raise LLMError("Ollama returned an empty response.") + return content + + def _openrouter_chat(self, messages: List[Dict[str, str]]) -> str: + api_key = str(self.config.get("openrouter_api_key") or "") + model = str(self.config.get("openrouter_model") or "openrouter/auto") + base_url = str(self.config.get("openrouter_base_url") or "https://openrouter.ai/api/v1") + + if not api_key: + raise LLMError("OpenRouter API key is missing in settings.") + + client = OpenAI(api_key=api_key, base_url=base_url) + try: + chat = client.chat.completions.create( + model=model, + messages=messages, + stream=False, + ) + except Exception as exc: # pylint: disable=broad-except + raise LLMError("OpenRouter call failed. Check network and API key.") from exc + + try: + content = chat.choices[0].message.content + except (AttributeError, IndexError): + content = None + if not content: + raise LLMError("OpenRouter returned an empty response.") + return content + + def chat(self, messages: List[Dict[str, str]]) -> str: + provider = self.config.get("provider", "Ollama (Local)") + if provider == "Ollama (Local)": + return self._ollama_chat(messages) + if provider == "OpenRouter (API)": + return self._openrouter_chat(messages) + raise LLMError("Unsupported provider selected.") + + def request_action(self, objective: str, screenshot_note: str) -> str: + prompt = ( + "You are controlling a computer. Decide the SINGLE next action as strict JSON only. " + "Use one of: click, type, hotkey, wait, done. Respond with JSON only." + ) + instructions = ( + "Schema: {\"type\":\"click\",\"x\":int,\"y\":int} | " + "{\"type\":\"type\",\"text\":str} | " + "{\"type\":\"hotkey\",\"keys\":[str,...]} | " + "{\"type\":\"wait\",\"seconds\":float} | " + "{\"type\":\"done\",\"reason\":str}. " + "Only one action. No explanation." 
+ ) + user_message = ( + f"Objective: {objective}\n" + f"Latest screenshot: {screenshot_note}\n" + "Output JSON only." + ) + messages = [ + {"role": "system", "content": prompt}, + {"role": "user", "content": instructions}, + {"role": "user", "content": user_message}, + ] + return self.chat(messages) + + +def parse_action_text(text: str) -> Optional[Dict[str, object]]: + try: + data = json.loads(text) + except json.JSONDecodeError: + return None + if not isinstance(data, dict): + return None + action_type = data.get("type") + if action_type not in {"click", "type", "hotkey", "wait", "done"}: + return None + if action_type == "click": + if isinstance(data.get("x"), int) and isinstance(data.get("y"), int): + return {"type": "click", "x": data["x"], "y": data["y"]} + elif action_type == "type": + if isinstance(data.get("text"), str): + return {"type": "type", "text": data["text"]} + elif action_type == "hotkey": + keys = data.get("keys") + if isinstance(keys, list) and all(isinstance(k, str) for k in keys): + return {"type": "hotkey", "keys": keys} + elif action_type == "wait": + seconds = data.get("seconds") + if isinstance(seconds, (int, float)): + return {"type": "wait", "seconds": float(seconds)} + elif action_type == "done": + reason = data.get("reason") + if isinstance(reason, str): + return {"type": "done", "reason": reason} + return None diff --git a/rebuild_v1/gui.py b/rebuild_v1/gui.py new file mode 100644 index 00000000..e3bcca15 --- /dev/null +++ b/rebuild_v1/gui.py @@ -0,0 +1,214 @@ +import threading +import tkinter as tk +from tkinter import ttk +from typing import Callable, Dict, List + +from config import load_config, save_config + + +class AppGUI: + def __init__( + self, + root: tk.Tk, + run_callback: Callable[[List[str]], None], + stop_callback: Callable[[], None], + log_export: Callable[[Callable[[str], None]], None], + ): + self.root = root + self.run_callback = run_callback + self.stop_callback = stop_callback + self.log_export = log_export + 
self.config = load_config() + self.tasks: List[str] = [] + + root.title("Self-Operating Computer v1") + root.geometry("760x520") + + notebook = ttk.Notebook(root) + notebook.pack(fill=tk.BOTH, expand=True) + + self.log_text = tk.Text(root, wrap="word", height=10, state=tk.DISABLED) + + self._build_tasks_tab(notebook) + self._build_settings_tab(notebook) + self._build_logs_tab(notebook) + + self.log_export(self.log) + + def _build_tasks_tab(self, notebook: ttk.Notebook) -> None: + frame = ttk.Frame(notebook) + notebook.add(frame, text="Tasks") + + ttk.Label(frame, text="Objective").pack(anchor=tk.W, padx=8, pady=(8, 2)) + self.objective_input = tk.Text(frame, height=4) + self.objective_input.pack(fill=tk.X, padx=8) + + button_frame = ttk.Frame(frame) + button_frame.pack(fill=tk.X, padx=8, pady=6) + + ttk.Button(button_frame, text="Add Task", command=self.add_task).pack( + side=tk.LEFT, padx=4 + ) + ttk.Button(button_frame, text="Run", command=self.run_tasks).pack( + side=tk.LEFT, padx=4 + ) + ttk.Button(button_frame, text="Stop", command=self.stop_callback).pack( + side=tk.LEFT, padx=4 + ) + ttk.Button(button_frame, text="Clear", command=self.clear_tasks).pack( + side=tk.LEFT, padx=4 + ) + + self.dry_run_var = tk.BooleanVar(value=bool(self.config.get("dry_run", True))) + dry_run_check = ttk.Checkbutton( + frame, + text="Dry Run (do not execute actions)", + variable=self.dry_run_var, + command=self._persist_dry_run, + ) + dry_run_check.pack(anchor=tk.W, padx=8, pady=4) + + ttk.Label(frame, text="Task Queue").pack(anchor=tk.W, padx=8, pady=(10, 2)) + self.tasks_list = tk.Listbox(frame, height=8) + self.tasks_list.pack(fill=tk.BOTH, expand=True, padx=8, pady=(0, 8)) + + def _build_settings_tab(self, notebook: ttk.Notebook) -> None: + frame = ttk.Frame(notebook) + notebook.add(frame, text="Settings") + + provider_frame = ttk.Frame(frame) + provider_frame.pack(fill=tk.X, padx=8, pady=6) + ttk.Label(provider_frame, text="Provider").pack(anchor=tk.W) + self.provider_var = 
tk.StringVar(value=str(self.config.get("provider", "Ollama (Local)"))) + provider_menu = ttk.Combobox( + provider_frame, + textvariable=self.provider_var, + values=["Ollama (Local)", "OpenRouter (API)"], + state="readonly", + ) + provider_menu.pack(fill=tk.X, pady=2) + provider_menu.bind("<>", lambda _event: self._toggle_provider_fields()) + + self.ollama_host_var = tk.StringVar(value=str(self.config.get("ollama_host"))) + self.ollama_model_var = tk.StringVar(value=str(self.config.get("ollama_model"))) + self.openrouter_key_var = tk.StringVar(value=str(self.config.get("openrouter_api_key"))) + self.openrouter_model_var = tk.StringVar(value=str(self.config.get("openrouter_model"))) + self.openrouter_base_var = tk.StringVar(value=str(self.config.get("openrouter_base_url"))) + + self.provider_container = ttk.Frame(frame) + self.provider_container.pack(fill=tk.X, padx=8, pady=4) + self._build_provider_fields() + + extras = ttk.Frame(frame) + extras.pack(fill=tk.X, padx=8, pady=6) + + ttk.Label(extras, text="Max steps").pack(anchor=tk.W) + self.max_steps_var = tk.IntVar(value=int(self.config.get("max_steps", 10))) + ttk.Entry(extras, textvariable=self.max_steps_var).pack(fill=tk.X, pady=2) + + ttk.Label(extras, text="Delay between actions (seconds)").pack(anchor=tk.W, pady=(8, 0)) + self.delay_var = tk.DoubleVar(value=float(self.config.get("delay_seconds", 0.6))) + ttk.Entry(extras, textvariable=self.delay_var).pack(fill=tk.X, pady=2) + + ttk.Label(extras, text="Stop hotkey (e.g., ctrl+alt+s)").pack(anchor=tk.W, pady=(8, 0)) + self.stop_hotkey_var = tk.StringVar(value=str(self.config.get("stop_hotkey", "ctrl+alt+s"))) + ttk.Entry(extras, textvariable=self.stop_hotkey_var).pack(fill=tk.X, pady=2) + + ttk.Button(frame, text="Save Settings", command=self.save_settings).pack( + padx=8, pady=10, anchor=tk.E + ) + + def _build_provider_fields(self) -> None: + for child in list(self.provider_container.winfo_children()): + child.destroy() + + provider = self.provider_var.get() 
+ if provider == "Ollama (Local)": + ttk.Label(self.provider_container, text="Ollama Host").pack(anchor=tk.W) + ttk.Entry(self.provider_container, textvariable=self.ollama_host_var).pack( + fill=tk.X, pady=2 + ) + ttk.Label(self.provider_container, text="Ollama Model").pack(anchor=tk.W, pady=(6, 0)) + ttk.Entry(self.provider_container, textvariable=self.ollama_model_var).pack( + fill=tk.X, pady=2 + ) + else: + ttk.Label(self.provider_container, text="OpenRouter API Key").pack(anchor=tk.W) + ttk.Entry(self.provider_container, textvariable=self.openrouter_key_var, show="*").pack( + fill=tk.X, pady=2 + ) + ttk.Label(self.provider_container, text="OpenRouter Model").pack(anchor=tk.W, pady=(6, 0)) + ttk.Entry(self.provider_container, textvariable=self.openrouter_model_var).pack( + fill=tk.X, pady=2 + ) + ttk.Label(self.provider_container, text="OpenRouter Base URL").pack(anchor=tk.W, pady=(6, 0)) + ttk.Entry(self.provider_container, textvariable=self.openrouter_base_var).pack( + fill=tk.X, pady=2 + ) + + def _build_logs_tab(self, notebook: ttk.Notebook) -> None: + frame = ttk.Frame(notebook) + notebook.add(frame, text="Logs") + scrollbar = ttk.Scrollbar(frame) + scrollbar.pack(side=tk.RIGHT, fill=tk.Y) + self.log_text = tk.Text(frame, wrap="word", state=tk.DISABLED) + self.log_text.pack(fill=tk.BOTH, expand=True) + self.log_text.config(yscrollcommand=scrollbar.set) + scrollbar.config(command=self.log_text.yview) + + def add_task(self) -> None: + text = self.objective_input.get("1.0", tk.END).strip() + if text: + self.tasks.append(text) + self.tasks_list.insert(tk.END, text) + self.objective_input.delete("1.0", tk.END) + + def clear_tasks(self) -> None: + self.tasks.clear() + self.tasks_list.delete(0, tk.END) + + def run_tasks(self) -> None: + if not self.tasks: + text = self.objective_input.get("1.0", tk.END).strip() + if text: + self.tasks.append(text) + self.tasks_list.insert(tk.END, text) + if not self.tasks: + self.log("No tasks to run.") + return + 
threading.Thread(target=self.run_callback, args=(self.tasks.copy(),), daemon=True).start() + + def _persist_dry_run(self) -> None: + self.config["dry_run"] = self.dry_run_var.get() + save_config(self.config) + + def save_settings(self) -> None: + self.config.update( + { + "provider": self.provider_var.get(), + "ollama_host": self.ollama_host_var.get(), + "ollama_model": self.ollama_model_var.get(), + "openrouter_api_key": self.openrouter_key_var.get(), + "openrouter_model": self.openrouter_model_var.get(), + "openrouter_base_url": self.openrouter_base_var.get(), + "max_steps": self.max_steps_var.get(), + "delay_seconds": self.delay_var.get(), + "stop_hotkey": self.stop_hotkey_var.get(), + "dry_run": self.dry_run_var.get(), + } + ) + save_config(self.config) + self.log("Settings saved.") + self._build_provider_fields() + + def _toggle_provider_fields(self) -> None: + self.save_settings() + self._build_provider_fields() + + def log(self, message: str) -> None: + self.log_text.configure(state=tk.NORMAL) + self.log_text.insert(tk.END, message + "\n") + self.log_text.see(tk.END) + self.log_text.configure(state=tk.DISABLED) + + diff --git a/rebuild_v1/main.py b/rebuild_v1/main.py new file mode 100644 index 00000000..36a0821b --- /dev/null +++ b/rebuild_v1/main.py @@ -0,0 +1,67 @@ +import tkinter as tk +from typing import Callable, List + +from automation import ActionLooper, AutomationEngine +from config import load_config +from engine_llm import LLMEngine, LLMError, parse_action_text +from gui import AppGUI + + +class AppController: + def __init__(self, root: tk.Tk): + self.root = root + self.logger: Callable[[str], None] = lambda msg: None + self.config = load_config() + self.automation = AutomationEngine( + dry_run=bool(self.config.get("dry_run", True)), + stop_hotkey=str(self.config.get("stop_hotkey", "ctrl+alt+s")), + ) + self.gui = AppGUI(root, self.run_tasks, self.stop, self.export_logger) + + def export_logger(self, logger: Callable[[str], None]) -> None: + 
self.logger = logger + + def log(self, message: str) -> None: + self.logger(message) + + def stop(self) -> None: + self.automation.stop() + self.log("Stop signal sent.") + + def run_tasks(self, tasks: List[str]) -> None: + for index, objective in enumerate(tasks, start=1): + self.config = load_config() + self.automation.dry_run = bool(self.config.get("dry_run", True)) + self.automation.stop_hotkey = str(self.config.get("stop_hotkey", "ctrl+alt+s")) + llm = LLMEngine(self.config) + looper = ActionLooper( + automation=self.automation, + request_action=llm.request_action, + parse_action=parse_action_text, + max_steps=int(self.config.get("max_steps", 10)), + delay_seconds=float(self.config.get("delay_seconds", 0.6)), + log_callback=self.log, + ) + self.log(f"Running task {index}/{len(tasks)}: {objective}") + try: + outcome = looper.run(objective) + except LLMError as exc: + self.log(f"LLM error: {exc}") + break + except Exception as exc: # pylint: disable=broad-except + self.log(f"Unexpected error: {exc}") + break + self.log(f"Task result: {outcome}") + if self.automation.should_stop(): + self.log("Stopped before finishing all tasks.") + break + + +def main() -> None: + root = tk.Tk() + AppController(root) + root.mainloop() + + +if __name__ == "__main__": + main() diff --git a/rebuild_v1/requirements.txt b/rebuild_v1/requirements.txt new file mode 100644 index 00000000..2aa9e07e --- /dev/null +++ b/rebuild_v1/requirements.txt @@ -0,0 +1,6 @@ +openai +requests +python-dotenv +pillow +pyautogui +keyboard From 4a4c703ab3f4a8f738be93e870890dab927b8b9c Mon Sep 17 00:00:00 2001 From: alikumale <149090479+alikumale@users.noreply.github.com> Date: Tue, 16 Dec 2025 22:50:31 +0800 Subject: [PATCH 7/8] Handle hotkey strings and refine OpenRouter prompt --- rebuild_v1/.env.example | 2 + rebuild_v1/.gitignore | 5 + rebuild_v1/README.md | 62 ++++++- rebuild_v1/automation.py | 271 ++++++++++++++++++++++++++++++ rebuild_v1/config.py | 46 +++++ rebuild_v1/engine_llm.py | 323 
++++++++++++++++++++++++++++++++++++ rebuild_v1/gui.py | 301 +++++++++++++++++++++++++++++++++ rebuild_v1/main.py | 114 +++++++++++++ rebuild_v1/requirements.txt | 6 + 9 files changed, 1129 insertions(+), 1 deletion(-) create mode 100644 rebuild_v1/.env.example create mode 100644 rebuild_v1/.gitignore create mode 100644 rebuild_v1/automation.py create mode 100644 rebuild_v1/config.py create mode 100644 rebuild_v1/engine_llm.py create mode 100644 rebuild_v1/gui.py create mode 100644 rebuild_v1/main.py create mode 100644 rebuild_v1/requirements.txt diff --git a/rebuild_v1/.env.example b/rebuild_v1/.env.example new file mode 100644 index 00000000..38354282 --- /dev/null +++ b/rebuild_v1/.env.example @@ -0,0 +1,2 @@ +# Optional: set your OpenRouter API key for Phase B +OPENROUTER_API_KEY= diff --git a/rebuild_v1/.gitignore b/rebuild_v1/.gitignore new file mode 100644 index 00000000..404642ea --- /dev/null +++ b/rebuild_v1/.gitignore @@ -0,0 +1,5 @@ +config.json +.env +.venv +__pycache__/ +*.pyc diff --git a/rebuild_v1/README.md b/rebuild_v1/README.md index c1e3b52e..f575297d 100644 --- a/rebuild_v1/README.md +++ b/rebuild_v1/README.md @@ -1 +1,61 @@ -HI +A minimal Windows-friendly rewrite that runs a decide-and-act loop with a local Ollama model (no API keys required). Phase B wiring for OpenRouter is also in place. 
+ +## Features +- Tkinter GUI with Tasks, Settings, and Logs tabs +- Provider modes: Ollama Text, Ollama Vision, OpenRouter API +- Safety toggles: Dry Run (default ON), Confirm Before Execute, Block Clicks, Block Terminal Typing, STOP hotkey/button +- Loop: screenshot → LLM JSON action → validate → (optionally) execute via `pyautogui` +- Config persisted to `rebuild_v1/config.json` + +## Requirements +- Python 3.11 on Windows +- [Ollama](https://ollama.com) running locally (default host `http://localhost:11434`) +- Optional: OpenRouter API key for Phase B + +Install Python packages: + +```bash +python -m venv .venv +.venv\\Scripts\\activate +pip install -r rebuild_v1/requirements.txt +``` + +## Running the app +```bash +python -m rebuild_v1.main +``` +If Tkinter fails to start in a headless environment, run on a local desktop session. + +## Recommended modes +- **Local Text**: fastest, safest with clicks blocked by default. Ideal when you want conservative hotkeys/typing. +- **Local Vision**: needs a vision model; may be slower on some PCs. Increase timeout (600s+) if the model is large. +- **OpenRouter API**: highest accuracy if you have an API key. + +Pull the default Ollama models before running: +```bash +ollama pull llama3.2:3b +ollama pull llava:7b +``` + +## Using Ollama (Phase A) +1. Start Ollama: `ollama serve` +2. Pick "Ollama (Local Text)" or "Ollama (Local Vision)" in Settings and adjust models/host if needed. +3. Vision mode sends the latest screenshot as base64 to the model. + +The app checks connectivity to Ollama and shows a friendly message if the server or model is unavailable. + +## Safety controls +- **Dry Run**: enabled by default; shows actions without executing. +- **Confirm Before Execute**: prompts before any real action; terminal windows require double-confirm for typing/hotkeys. +- **Block Clicks**: skips click actions and asks the model again. +- **Block Typing In Terminals**: prevents destructive commands unless you confirm twice. 
+- **STOP hotkey**: `Ctrl+Alt+S` (configurable) registered via the `keyboard` library. +- **STOP button**: halts the current loop. +- **Max steps**, **delay**, **timeout** configurable in Settings. + +## OpenRouter (Phase B) +A provider dropdown exists; if "OpenRouter (API)" is chosen, set your API key, model, and base URL in Settings. Requests use the OpenAI-compatible SDK. + +## Notes +- Screenshots use Pillow's `ImageGrab`; ensure a visible desktop session. +- Avoid sharing `config.json`—it is gitignored and may contain sensitive keys. diff --git a/rebuild_v1/automation.py b/rebuild_v1/automation.py new file mode 100644 index 00000000..9116f883 --- /dev/null +++ b/rebuild_v1/automation.py @@ -0,0 +1,271 @@ +import threading +import time +from dataclasses import dataclass +from typing import Callable, Dict, Optional, Tuple + +import keyboard +import pyautogui +import tkinter as tk +from PIL import ImageGrab +from tkinter import messagebox + +pyautogui.FAILSAFE = False + + +@dataclass +class ActionResult: + action: Dict[str, object] + executed: bool + error: Optional[str] = None + + +class AutomationEngine: + def __init__( + self, + dry_run: bool = True, + stop_hotkey: str = "ctrl+alt+s", + confirm_before_execute: bool = True, + block_clicks: bool = True, + block_terminal_typing: bool = True, + root: Optional[tk.Tk] = None, + ): + self.dry_run = dry_run + self.stop_hotkey = stop_hotkey + self.confirm_before_execute = confirm_before_execute + self.block_clicks = block_clicks + self.block_terminal_typing = block_terminal_typing + self._stop_flag = threading.Event() + self._hotkey_registered = False + self.log_callback: Callable[[str], None] = lambda _msg: None + self.root = root + self.screen_size: Optional[Tuple[int, int]] = None + + def set_logger(self, logger: Callable[[str], None]) -> None: + self.log_callback = logger + + def update_settings( + self, + *, + dry_run: Optional[bool] = None, + stop_hotkey: Optional[str] = None, + confirm_before_execute: 
Optional[bool] = None, + block_clicks: Optional[bool] = None, + block_terminal_typing: Optional[bool] = None, + ) -> None: + if dry_run is not None: + self.dry_run = dry_run + if stop_hotkey is not None: + self.stop_hotkey = stop_hotkey + if confirm_before_execute is not None: + self.confirm_before_execute = confirm_before_execute + if block_clicks is not None: + self.block_clicks = block_clicks + if block_terminal_typing is not None: + self.block_terminal_typing = block_terminal_typing + + def register_stop_hotkey(self) -> None: + if self._hotkey_registered: + return + try: + keyboard.add_hotkey(self.stop_hotkey, self.stop) + self._hotkey_registered = True + except keyboard.KeyboardException: + # Keyboard may need elevated privileges; ignore if unavailable + pass + + def stop(self) -> None: + self._stop_flag.set() + + def reset_stop(self) -> None: + self._stop_flag.clear() + + def should_stop(self) -> bool: + return self._stop_flag.is_set() + + def set_screen_size(self, size: Tuple[int, int]) -> None: + self.screen_size = size + + def capture_screenshot(self, save_path: Optional[str] = None): + image = ImageGrab.grab() + path = save_path + if save_path: + image.save(save_path) + return image, path + + def validate_action( + self, action: Dict[str, object], screen_size: Optional[Tuple[int, int]] + ) -> Tuple[bool, Optional[str], bool]: + action_type = action.get("type") + if action_type == "click": + if self.block_clicks: + return False, "Clicks are blocked by safety settings.", True + if screen_size: + width, height = screen_size + x, y = int(action.get("x", -1)), int(action.get("y", -1)) + if x < 0 or y < 0 or x >= width or y >= height: + return False, "Click coordinates out of bounds.", True + return True, None, False + + def _confirm_action(self, description: str, double_confirm: bool = False) -> bool: + if self.root is None: + return True + proceed = messagebox.askyesno("Confirm Action", description) + if not proceed: + return False + if double_confirm: + 
proceed = messagebox.askyesno( + "Confirm Again", f"Are you absolutely sure? {description}" + ) + return proceed + + def _active_window_title(self) -> str: + try: + win = pyautogui.getActiveWindow() + if win and getattr(win, "title", None): + return str(win.title) + except Exception: # pylint: disable=broad-except + pass + try: + title = pyautogui.getActiveWindowTitle() + if title: + return str(title) + except Exception: # pylint: disable=broad-except + pass + return "" + + def _is_terminal_window(self) -> bool: + title = self._active_window_title().lower() + for keyword in ["powershell", "command prompt", "cmd", "terminal", "bash", "zsh"]: + if keyword in title: + return True + return False + + def execute_action(self, action: Dict[str, object]) -> ActionResult: + if self.dry_run: + return ActionResult(action=action, executed=False, error=None) + + action_type = action.get("type") + description = f"Execute action: {action}" + + if self.confirm_before_execute: + if not self._confirm_action(description): + return ActionResult(action=action, executed=False, error="User cancelled action") + + if action_type in {"type", "hotkey"} and self.block_terminal_typing and self._is_terminal_window(): + if not self._confirm_action( + "Action targets a terminal-like window. 
Confirm twice to proceed.", double_confirm=True + ): + return ActionResult(action=action, executed=False, error="Blocked in terminal window") + + try: + if action_type == "click": + pyautogui.click(x=int(action["x"]), y=int(action["y"])) + elif action_type == "type": + pyautogui.typewrite(str(action["text"])) + elif action_type == "hotkey": + keys = [str(k) for k in action.get("keys", [])] + pyautogui.hotkey(*keys) + elif action_type == "wait": + time.sleep(float(action.get("seconds", 0))) + elif action_type == "done": + pass + else: + return ActionResult(action=action, executed=False, error="Unknown action type") + return ActionResult(action=action, executed=True, error=None) + except Exception as exc: # pylint: disable=broad-except + return ActionResult(action=action, executed=False, error=str(exc)) + + +class ActionLooper: + def __init__( + self, + automation: AutomationEngine, + request_action: Callable[..., str], + parse_action: Callable[[str, Optional[Callable[[str], None]]], Optional[Dict[str, object]]], + max_steps: int, + delay_seconds: float, + log_callback: Callable[[str], None], + ): + self.automation = automation + self.request_action = request_action + self.parse_action = parse_action + self.max_steps = max_steps + self.delay_seconds = delay_seconds + self.log_callback = log_callback + + def run(self, objective: str) -> str: + self.automation.reset_stop() + self.automation.register_stop_hotkey() + screen = pyautogui.size() + self.automation.set_screen_size((screen.width, screen.height)) + self.log_callback(f"Screen size detected: {screen.width}x{screen.height}") + + for step in range(1, self.max_steps + 1): + if self.automation.should_stop(): + return "Stopped by user" + + try: + image, _ = self.automation.capture_screenshot() + screenshot_note = f"screenshot captured ({image.width}x{image.height})" + self.log_callback(f"Screenshot captured for step {step}: {image.width}x{image.height}") + except Exception as exc: # pylint: disable=broad-except + 
screenshot_note = f"screenshot failed: {exc}" + self.log_callback(f"Screenshot failed: {exc}") + image = None + + screenshot_bytes = None + if image: + try: + from io import BytesIO + + buffer = BytesIO() + image.save(buffer, format="PNG") + screenshot_bytes = buffer.getvalue() + except Exception as exc: # pylint: disable=broad-except + self.log_callback(f"Failed to serialize screenshot: {exc}") + + retries = 0 + action_data: Optional[Dict[str, object]] = None + reask_guard = 0 + while retries < 3 and not self.automation.should_stop(): + raw = self.request_action( + objective, + screenshot_note=screenshot_note, + screenshot_bytes=screenshot_bytes, + screen_size=f"{screen.width}x{screen.height}", + ) + self.log_callback(f"Raw model response: {raw}") + action_data = self.parse_action(raw, self.log_callback) + if action_data is None: + retries += 1 + self.log_callback("Model returned invalid JSON. Retrying...") + continue + + valid, reason, reask = self.automation.validate_action(action_data, (screen.width, screen.height)) + if not valid: + self.log_callback(reason or "Action rejected") + if reask: + reask_guard += 1 + if reask_guard >= 3: + return "Action rejected repeatedly" + continue + break + + if action_data is None: + return "Failed to parse action after retries" + + result = self.automation.execute_action(action_data) + executed_text = "executed" if result.executed else "dry-run" + self.log_callback(f"Action step {step}: {action_data} ({executed_text})") + if result.error: + self.log_callback(f"Action error: {result.error}") + if result.error.startswith("User cancelled"): + return "User cancelled action" + + if action_data.get("type") == "done": + return str(action_data.get("reason", "Done")) + + if self.automation.should_stop(): + return "Stopped by user" + time.sleep(self.delay_seconds) + return "Reached max steps" diff --git a/rebuild_v1/config.py b/rebuild_v1/config.py new file mode 100644 index 00000000..480b9eb6 --- /dev/null +++ b/rebuild_v1/config.py 
@@ -0,0 +1,46 @@ +import json +from pathlib import Path +from typing import Any, Dict + +CONFIG_PATH = Path(__file__).parent / "config.json" + +DEFAULT_CONFIG: Dict[str, Any] = { + "provider_mode": "Ollama (Local Text)", + "ollama_host": "http://localhost:11434", + "ollama_text_model": "llama3.2:3b", + "ollama_vision_model": "llava:7b", + "openrouter_api_key": "", + "openrouter_model": "openrouter/auto", + "openrouter_base_url": "https://openrouter.ai/api/v1", + "llm_timeout_seconds": 600, + "max_steps": 20, + "delay_seconds": 0.6, + "stop_hotkey": "ctrl+alt+s", + "dry_run": True, + "confirm_before_execute": True, + "block_clicks": True, + "block_terminal_typing": True, +} + + +def load_config() -> Dict[str, Any]: + if CONFIG_PATH.exists(): + try: + with CONFIG_PATH.open("r", encoding="utf-8") as f: + data = json.load(f) + merged = DEFAULT_CONFIG.copy() + if isinstance(data, dict): + merged.update(data) + return merged + except (json.JSONDecodeError, OSError, TypeError): + return DEFAULT_CONFIG.copy() + return DEFAULT_CONFIG.copy() + + +def save_config(config: Dict[str, Any]) -> None: + try: + with CONFIG_PATH.open("w", encoding="utf-8") as f: + json.dump(config, f, indent=2) + except OSError: + # Prefer silent failure over crashing the UI when filesystem is read-only + pass diff --git a/rebuild_v1/engine_llm.py b/rebuild_v1/engine_llm.py new file mode 100644 index 00000000..9c89d0ca --- /dev/null +++ b/rebuild_v1/engine_llm.py @@ -0,0 +1,323 @@ +import base64 +import io +import json +from typing import Callable, Dict, List, Optional + +import requests +from openai import OpenAI + + +class LLMError(Exception): + """Raised when the language model call fails.""" + + +def _first_json_object(text: str) -> Optional[Dict[str, object]]: + decoder = json.JSONDecoder() + idx = text.find("{") + while idx != -1: + try: + obj, _ = decoder.raw_decode(text[idx:]) + if isinstance(obj, dict): + if idx > 0: + # leading noise; still return first valid object + return obj + return 
obj + except json.JSONDecodeError: + pass + idx = text.find("{", idx + 1) + return None + + +def parse_action_text(text: str, logger: Optional[Callable[[str], None]] = None) -> Optional[Dict[str, object]]: + """Parse first JSON object describing an action. Logs when multiple objects are present.""" + try: + data = json.loads(text) + extra_text = None + except json.JSONDecodeError: + data = _first_json_object(text) + extra_text = text + else: + if isinstance(text, str) and text.strip().endswith("}"): + extra_text = None + else: + extra_text = text + + if data is None: + return None + + if extra_text and logger: + logger("Model returned extra text; using first JSON object only.") + + if not isinstance(data, dict): + return None + action_type = data.get("type") + if action_type not in {"click", "type", "hotkey", "wait", "done"}: + return None + if action_type == "click": + if isinstance(data.get("x"), int) and isinstance(data.get("y"), int): + return {"type": "click", "x": data["x"], "y": data["y"]} + elif action_type == "type": + if isinstance(data.get("text"), str): + return {"type": "type", "text": data["text"]} + elif action_type == "hotkey": + keys = data.get("keys") + if isinstance(keys, str): + keys = [keys] + if isinstance(keys, list): + normalized = [] + for key in keys: + if isinstance(key, str) and "+" in key: + normalized.extend(part.strip() for part in key.split("+") if part.strip()) + elif isinstance(key, str): + normalized.append(key) + if normalized: + return {"type": "hotkey", "keys": normalized} + elif action_type == "wait": + seconds = data.get("seconds") + if isinstance(seconds, (int, float)): + return {"type": "wait", "seconds": float(seconds)} + elif action_type == "done": + reason = data.get("reason") + if isinstance(reason, str): + return {"type": "done", "reason": reason} + return None + + +class LLMEngine: + def __init__(self, config: Dict[str, object], logger: Optional[Callable[[str], None]] = None): + self.config = config + self.logger = 
logger or (lambda _msg: None) + + def _raise(self, provider_mode: str, endpoint: str, model: str, exc: Exception) -> LLMError: + return LLMError( + f"[{provider_mode}] call to {endpoint} with model '{model}' failed: {repr(exc)}" + ) + + def _ollama_chat( + self, + messages: List[Dict[str, object]], + model: str, + host: str, + timeout: float, + provider_mode: str, + ) -> str: + try: + health = requests.get(f"{host}/api/tags", timeout=5) + except requests.RequestException as exc: + raise self._raise(provider_mode, f"{host}/api/tags", model, exc) from exc + + if health.status_code != 200: + raise LLMError( + f"[{provider_mode}] Ollama responded with status {health.status_code}; is it running?" + ) + + payload = {"model": model, "messages": messages, "stream": False} + try: + response = requests.post( + f"{host}/api/chat", json=payload, timeout=timeout, stream=False + ) + except requests.RequestException as exc: + raise self._raise(provider_mode, f"{host}/api/chat", model, exc) from exc + + if response.status_code == 404: + raise LLMError( + f"[{provider_mode}] Model '{model}' not found. 
Install with: ollama pull {model}" + ) + if response.status_code >= 400: + raise LLMError( + f"[{provider_mode}] Ollama returned {response.status_code}: {response.text}" + ) + + try: + data = response.json() + except ValueError as exc: # pragma: no cover - defensive + raise self._raise(provider_mode, f"{host}/api/chat", model, exc) from exc + + message = data.get("message", {}) + content = message.get("content") + if not content: + raise LLMError(f"[{provider_mode}] Ollama returned an empty response.") + return content + + def _openrouter_chat( + self, messages: List[Dict[str, object]], model: str, base_url: str, api_key: str, timeout: float + ) -> str: + if not api_key: + raise LLMError("[OpenRouter (API)] Missing API key in settings.") + + client = OpenAI(api_key=api_key, base_url=base_url) + try: + chat = client.chat.completions.create( + model=model, + messages=messages, + stream=False, + timeout=timeout, + ) + except Exception as exc: # pylint: disable=broad-except + raise LLMError( + f"[OpenRouter (API)] call to {base_url} with model '{model}' failed: {repr(exc)}" + ) from exc + + try: + content = chat.choices[0].message.content + except (AttributeError, IndexError): # pragma: no cover - defensive + content = None + if not content: + raise LLMError("[OpenRouter (API)] returned an empty response.") + return content + + def request_action( + self, + objective: str, + screenshot_note: str, + screenshot_bytes: Optional[bytes] = None, + screen_size: Optional[str] = None, + ) -> str: + provider_mode = str(self.config.get("provider_mode", "Ollama (Local Text)")) + timeout = float(self.config.get("llm_timeout_seconds", 600)) + common_schema = ( + "Allowed actions as ONE JSON object: " + '{"type":"click","x":int,"y":int} | ' + '{"type":"type","text":str} | ' + '{"type":"hotkey","keys":[str,...]} | ' + '{"type":"wait","seconds":float} | ' + '{"type":"done","reason":str}.' 
+ ) + examples = ( + "Examples: {\"type\":\"hotkey\",\"keys\":[\"win\"]} | " + "{\"type\":\"type\",\"text\":\"notepad\"} | " + "{\"type\":\"click\",\"x\":120,\"y\":220} | " + "{\"type\":\"wait\",\"seconds\":1.0} | " + "{\"type\":\"done\",\"reason\":\"finished\"}." + ) + safe_text_rules = ( + "In text-only mode avoid free clicking. Prefer hotkeys, typing, waits, Win search, " + "Ctrl+L for address bar, then type and Enter. OUTPUT JSON ONLY." + ) + vision_rules = ( + "You may click visible elements, but still prefer reliable hotkeys and typing when possible. " + "OUTPUT JSON ONLY." + ) + user_block = ( + f"Objective: {objective}\n" + f"Latest screenshot info: {screenshot_note}\n" + f"Screen size: {screen_size or 'unknown'}\n" + "Reply with exactly one JSON object." + ) + + messages: List[Dict[str, object]] = [ + { + "role": "system", + "content": ( + "You are controlling a computer. Decide the single next action. " + + common_schema + + " " + + examples + ), + } + ] + + if provider_mode == "Ollama (Local Text)": + messages.append({"role": "user", "content": safe_text_rules}) + messages.append({"role": "user", "content": user_block}) + return self._ollama_chat( + messages, + model=str(self.config.get("ollama_text_model", "llama3.2:3b")), + host=str(self.config.get("ollama_host", "http://localhost:11434")), + timeout=timeout, + provider_mode=provider_mode, + ) + + if provider_mode == "Ollama (Local Vision)": + if screenshot_bytes is None: + raise LLMError("Vision mode requires screenshot bytes.") + b64_image = base64.b64encode(screenshot_bytes).decode("utf-8") + messages.append({"role": "user", "content": vision_rules}) + messages.append( + { + "role": "user", + "content": user_block, + "images": [b64_image], + } + ) + try: + return self._ollama_chat( + messages, + model=str(self.config.get("ollama_vision_model", "llava:7b")), + host=str(self.config.get("ollama_host", "http://localhost:11434")), + timeout=timeout, + provider_mode=provider_mode, + ) + except LLMError as 
exc: + if "images" in str(exc).lower(): + self.logger( + "The selected Ollama model may not support images. Try pulling llava:7b and set it in settings." + ) + raise + + if provider_mode == "OpenRouter (API)": + messages.append({"role": "user", "content": safe_text_rules}) + messages.append({"role": "user", "content": user_block}) + return self._openrouter_chat( + messages, + model=str(self.config.get("openrouter_model", "openrouter/auto")), + base_url=str(self.config.get("openrouter_base_url", "https://openrouter.ai/api/v1")), + api_key=str(self.config.get("openrouter_api_key", "")), + timeout=timeout, + ) + + raise LLMError(f"Unsupported provider mode: {provider_mode}") + + +def test_provider(config: Dict[str, object]) -> str: + engine = LLMEngine(config) + provider_mode = str(config.get("provider_mode", "Ollama (Local Text)")) + timeout = float(config.get("llm_timeout_seconds", 600)) + if provider_mode.startswith("Ollama"): + host = str(config.get("ollama_host", "http://localhost:11434")) + model = ( + str(config.get("ollama_text_model", "llama3.2:3b")) + if provider_mode == "Ollama (Local Text)" + else str(config.get("ollama_vision_model", "llava:7b")) + ) + try: + tags = requests.get(f"{host}/api/tags", timeout=5) + tags.raise_for_status() + except Exception as exc: # pylint: disable=broad-except + return f"Ollama connectivity failed: {repr(exc)}" + + payload_messages: List[Dict[str, object]] = [ + { + "role": "user", + "content": "Reply OK", + } + ] + if provider_mode == "Ollama (Local Vision)": + img = io.BytesIO() + from PIL import Image # lazy import to avoid overhead if unused + + Image.new("RGB", (2, 2), color="black").save(img, format="PNG") + img_bytes = img.getvalue() + payload_messages[0]["images"] = [base64.b64encode(img_bytes).decode("utf-8")] + try: + reply = engine._ollama_chat( # pylint: disable=protected-access + payload_messages, model=model, host=host, timeout=timeout, provider_mode=provider_mode + ) + except Exception as exc: # pylint: 
disable=broad-except + return f"Ollama chat failed: {repr(exc)}" + return f"Ollama test succeeded: {reply}" if reply else "Ollama test returned empty response." + + if provider_mode == "OpenRouter (API)": + try: + reply = engine._openrouter_chat( # pylint: disable=protected-access + messages=[{"role": "user", "content": "Reply OK"}], + model=str(config.get("openrouter_model", "openrouter/auto")), + base_url=str(config.get("openrouter_base_url", "https://openrouter.ai/api/v1")), + api_key=str(config.get("openrouter_api_key", "")), + timeout=timeout, + ) + except Exception as exc: # pylint: disable=broad-except + return f"OpenRouter test failed: {repr(exc)}" + return f"OpenRouter test succeeded: {reply}" if reply else "OpenRouter test returned empty response." + + return f"Unsupported provider mode: {provider_mode}" diff --git a/rebuild_v1/gui.py b/rebuild_v1/gui.py new file mode 100644 index 00000000..97c0f828 --- /dev/null +++ b/rebuild_v1/gui.py @@ -0,0 +1,301 @@ +import threading +import tkinter as tk +from tkinter import ttk +from typing import Callable, Dict, List + +from config import load_config, save_config + + +class AppGUI: + def __init__( + self, + root: tk.Tk, + run_callback: Callable[[List[str]], None], + stop_callback: Callable[[], None], + log_export: Callable[[Callable[[str], None]], None], + test_llm_callback: Callable[[], None], + test_screenshot_callback: Callable[[], None], + ): + self.root = root + self.run_callback = run_callback + self.stop_callback = stop_callback + self.log_export = log_export + self.test_llm_callback = test_llm_callback + self.test_screenshot_callback = test_screenshot_callback + self.config = load_config() + self.tasks: List[str] = [] + + root.title("Self-Operating Computer v1") + root.geometry("900x620") + + notebook = ttk.Notebook(root) + notebook.pack(fill=tk.BOTH, expand=True) + + self.log_text = tk.Text(root, wrap="word", height=10, state=tk.DISABLED) + + self._build_tasks_tab(notebook) + 
self._build_settings_tab(notebook) + self._build_logs_tab(notebook) + + self.log_export(self.log) + + def _build_tasks_tab(self, notebook: ttk.Notebook) -> None: + frame = ttk.Frame(notebook) + notebook.add(frame, text="Tasks") + + ttk.Label(frame, text="Objective").pack(anchor=tk.W, padx=8, pady=(8, 2)) + self.objective_input = tk.Text(frame, height=4) + self.objective_input.pack(fill=tk.X, padx=8) + + button_frame = ttk.Frame(frame) + button_frame.pack(fill=tk.X, padx=8, pady=6) + + ttk.Button(button_frame, text="Add Task", command=self.add_task).pack( + side=tk.LEFT, padx=4 + ) + ttk.Button(button_frame, text="Run", command=self.run_tasks).pack( + side=tk.LEFT, padx=4 + ) + ttk.Button(button_frame, text="Stop", command=self.stop_callback).pack( + side=tk.LEFT, padx=4 + ) + ttk.Button(button_frame, text="Clear", command=self.clear_tasks).pack( + side=tk.LEFT, padx=4 + ) + + safety_frame = ttk.LabelFrame(frame, text="Safety") + safety_frame.pack(fill=tk.X, padx=8, pady=6) + + self.dry_run_var = tk.BooleanVar(value=bool(self.config.get("dry_run", True))) + ttk.Checkbutton( + safety_frame, + text="Dry Run (do not execute actions)", + variable=self.dry_run_var, + command=self._persist_dry_run, + ).pack(anchor=tk.W, padx=6, pady=2) + + self.confirm_var = tk.BooleanVar( + value=bool(self.config.get("confirm_before_execute", True)) + ) + ttk.Checkbutton( + safety_frame, + text="Confirm Before Execute", + variable=self.confirm_var, + command=self._persist_confirm, + ).pack(anchor=tk.W, padx=6, pady=2) + + self.block_clicks_var = tk.BooleanVar(value=bool(self.config.get("block_clicks", True))) + ttk.Checkbutton( + safety_frame, + text="Block Click Actions", + variable=self.block_clicks_var, + command=self._persist_block_clicks, + ).pack(anchor=tk.W, padx=6, pady=2) + + self.block_terminal_var = tk.BooleanVar( + value=bool(self.config.get("block_terminal_typing", True)) + ) + ttk.Checkbutton( + safety_frame, + text="Block Typing In Terminals", + 
variable=self.block_terminal_var, + command=self._persist_block_terminal, + ).pack(anchor=tk.W, padx=6, pady=2) + + ttk.Label(frame, text="Task Queue").pack(anchor=tk.W, padx=8, pady=(10, 2)) + self.tasks_list = tk.Listbox(frame, height=10) + self.tasks_list.pack(fill=tk.BOTH, expand=True, padx=8, pady=(0, 8)) + + def _build_settings_tab(self, notebook: ttk.Notebook) -> None: + frame = ttk.Frame(notebook) + notebook.add(frame, text="Settings") + + provider_frame = ttk.LabelFrame(frame, text="Provider") + provider_frame.pack(fill=tk.X, padx=8, pady=6) + ttk.Label(provider_frame, text="Provider Mode").pack(anchor=tk.W) + self.provider_var = tk.StringVar( + value=str(self.config.get("provider_mode", "Ollama (Local Text)")) + ) + provider_menu = ttk.Combobox( + provider_frame, + textvariable=self.provider_var, + values=[ + "Ollama (Local Text)", + "Ollama (Local Vision)", + "OpenRouter (API)", + ], + state="readonly", + ) + provider_menu.pack(fill=tk.X, pady=2) + provider_menu.bind("<>", lambda _event: self._toggle_provider_fields()) + + self.ollama_host_var = tk.StringVar(value=str(self.config.get("ollama_host"))) + self.ollama_text_model_var = tk.StringVar( + value=str(self.config.get("ollama_text_model")) + ) + self.ollama_vision_model_var = tk.StringVar( + value=str(self.config.get("ollama_vision_model")) + ) + self.openrouter_key_var = tk.StringVar(value=str(self.config.get("openrouter_api_key"))) + self.openrouter_model_var = tk.StringVar(value=str(self.config.get("openrouter_model"))) + self.openrouter_base_var = tk.StringVar( + value=str(self.config.get("openrouter_base_url")) + ) + + self.provider_container = ttk.Frame(provider_frame) + self.provider_container.pack(fill=tk.X, padx=4, pady=4) + self._build_provider_fields() + + extras = ttk.LabelFrame(frame, text="Run Settings") + extras.pack(fill=tk.X, padx=8, pady=6) + + ttk.Label(extras, text="LLM Timeout Seconds").pack(anchor=tk.W) + self.timeout_var = 
tk.IntVar(value=int(self.config.get("llm_timeout_seconds", 600))) + ttk.Entry(extras, textvariable=self.timeout_var).pack(fill=tk.X, pady=2) + + ttk.Label(extras, text="Max steps").pack(anchor=tk.W, pady=(6, 0)) + self.max_steps_var = tk.IntVar(value=int(self.config.get("max_steps", 20))) + ttk.Entry(extras, textvariable=self.max_steps_var).pack(fill=tk.X, pady=2) + + ttk.Label(extras, text="Delay between actions (seconds)").pack(anchor=tk.W, pady=(6, 0)) + self.delay_var = tk.DoubleVar(value=float(self.config.get("delay_seconds", 0.6))) + ttk.Entry(extras, textvariable=self.delay_var).pack(fill=tk.X, pady=2) + + ttk.Label(extras, text="Stop hotkey (e.g., ctrl+alt+s)").pack(anchor=tk.W, pady=(6, 0)) + self.stop_hotkey_var = tk.StringVar( + value=str(self.config.get("stop_hotkey", "ctrl+alt+s")) + ) + ttk.Entry(extras, textvariable=self.stop_hotkey_var).pack(fill=tk.X, pady=2) + + button_bar = ttk.Frame(frame) + button_bar.pack(fill=tk.X, padx=8, pady=10) + ttk.Button(button_bar, text="Save Settings", command=self.save_settings).pack( + side=tk.RIGHT, padx=4 + ) + ttk.Button(button_bar, text="Test LLM", command=self._test_llm).pack( + side=tk.RIGHT, padx=4 + ) + ttk.Button(button_bar, text="Test Screenshot", command=self._test_screenshot).pack( + side=tk.RIGHT, padx=4 + ) + + def _build_provider_fields(self) -> None: + for child in list(self.provider_container.winfo_children()): + child.destroy() + + provider = self.provider_var.get() + if provider.startswith("Ollama"): + ttk.Label(self.provider_container, text="Ollama Host").pack(anchor=tk.W) + ttk.Entry(self.provider_container, textvariable=self.ollama_host_var).pack( + fill=tk.X, pady=2 + ) + ttk.Label(self.provider_container, text="Ollama Text Model").pack(anchor=tk.W, pady=(6, 0)) + ttk.Entry(self.provider_container, textvariable=self.ollama_text_model_var).pack( + fill=tk.X, pady=2 + ) + ttk.Label(self.provider_container, text="Ollama Vision Model").pack(anchor=tk.W, pady=(6, 0)) + 
ttk.Entry(self.provider_container, textvariable=self.ollama_vision_model_var).pack( + fill=tk.X, pady=2 + ) + else: + ttk.Label(self.provider_container, text="OpenRouter API Key").pack(anchor=tk.W) + ttk.Entry(self.provider_container, textvariable=self.openrouter_key_var, show="*").pack( + fill=tk.X, pady=2 + ) + ttk.Label(self.provider_container, text="OpenRouter Model").pack(anchor=tk.W, pady=(6, 0)) + ttk.Entry(self.provider_container, textvariable=self.openrouter_model_var).pack( + fill=tk.X, pady=2 + ) + ttk.Label(self.provider_container, text="OpenRouter Base URL").pack(anchor=tk.W, pady=(6, 0)) + ttk.Entry(self.provider_container, textvariable=self.openrouter_base_var).pack( + fill=tk.X, pady=2 + ) + + def _build_logs_tab(self, notebook: ttk.Notebook) -> None: + frame = ttk.Frame(notebook) + notebook.add(frame, text="Logs") + scrollbar = ttk.Scrollbar(frame) + scrollbar.pack(side=tk.RIGHT, fill=tk.Y) + self.log_text = tk.Text(frame, wrap="word", state=tk.DISABLED) + self.log_text.pack(fill=tk.BOTH, expand=True) + self.log_text.config(yscrollcommand=scrollbar.set) + scrollbar.config(command=self.log_text.yview) + + def add_task(self) -> None: + text = self.objective_input.get("1.0", tk.END).strip() + if text: + self.tasks.append(text) + self.tasks_list.insert(tk.END, text) + self.objective_input.delete("1.0", tk.END) + + def clear_tasks(self) -> None: + self.tasks.clear() + self.tasks_list.delete(0, tk.END) + + def run_tasks(self) -> None: + if not self.tasks: + text = self.objective_input.get("1.0", tk.END).strip() + if text: + self.tasks.append(text) + self.tasks_list.insert(tk.END, text) + if not self.tasks: + self.log("No tasks to run.") + return + threading.Thread(target=self.run_callback, args=(self.tasks.copy(),), daemon=True).start() + + def _persist_dry_run(self) -> None: + self.config["dry_run"] = self.dry_run_var.get() + save_config(self.config) + + def _persist_confirm(self) -> None: + self.config["confirm_before_execute"] = self.confirm_var.get() 
+ save_config(self.config) + + def _persist_block_clicks(self) -> None: + self.config["block_clicks"] = self.block_clicks_var.get() + save_config(self.config) + + def _persist_block_terminal(self) -> None: + self.config["block_terminal_typing"] = self.block_terminal_var.get() + save_config(self.config) + + def save_settings(self) -> None: + self.config.update( + { + "provider_mode": self.provider_var.get(), + "ollama_host": self.ollama_host_var.get(), + "ollama_text_model": self.ollama_text_model_var.get(), + "ollama_vision_model": self.ollama_vision_model_var.get(), + "openrouter_api_key": self.openrouter_key_var.get(), + "openrouter_model": self.openrouter_model_var.get(), + "openrouter_base_url": self.openrouter_base_var.get(), + "llm_timeout_seconds": self.timeout_var.get(), + "max_steps": self.max_steps_var.get(), + "delay_seconds": self.delay_var.get(), + "stop_hotkey": self.stop_hotkey_var.get(), + "dry_run": self.dry_run_var.get(), + "confirm_before_execute": self.confirm_var.get(), + "block_clicks": self.block_clicks_var.get(), + "block_terminal_typing": self.block_terminal_var.get(), + } + ) + save_config(self.config) + self.log("Settings saved.") + self._build_provider_fields() + + def _toggle_provider_fields(self) -> None: + self.save_settings() + self._build_provider_fields() + + def _test_llm(self) -> None: + self.save_settings() + self.test_llm_callback() + + def _test_screenshot(self) -> None: + self.test_screenshot_callback() + + def log(self, message: str) -> None: + self.log_text.configure(state=tk.NORMAL) + self.log_text.insert(tk.END, message + "\n") + self.log_text.see(tk.END) + self.log_text.configure(state=tk.DISABLED) diff --git a/rebuild_v1/main.py b/rebuild_v1/main.py new file mode 100644 index 00000000..44018abe --- /dev/null +++ b/rebuild_v1/main.py @@ -0,0 +1,114 @@ +import tempfile +import tkinter as tk +from typing import Callable, List + +from automation import ActionLooper, AutomationEngine +from config import load_config +from 
engine_llm import LLMEngine, LLMError, parse_action_text, test_provider +from gui import AppGUI + + +class AppController: + def __init__(self, root: tk.Tk): + self.root = root + self.logger: Callable[[str], None] = lambda msg: None + self.config = load_config() + self.automation = AutomationEngine( + dry_run=bool(self.config.get("dry_run", True)), + stop_hotkey=str(self.config.get("stop_hotkey", "ctrl+alt+s")), + confirm_before_execute=bool(self.config.get("confirm_before_execute", True)), + block_clicks=bool(self.config.get("block_clicks", True)), + block_terminal_typing=bool(self.config.get("block_terminal_typing", True)), + root=root, + ) + self.gui = AppGUI( + root, + self.run_tasks, + self.stop, + self.export_logger, + self.test_llm, + self.test_screenshot, + ) + + def export_logger(self, logger: Callable[[str], None]) -> None: + self.logger = logger + self.automation.set_logger(logger) + + def log(self, message: str) -> None: + self.logger(message) + + def stop(self) -> None: + self.automation.stop() + self.log("Stop signal sent.") + + def _log_provider_settings(self) -> None: + mode = str(self.config.get("provider_mode")) + timeout = self.config.get("llm_timeout_seconds") + if mode == "Ollama (Local Text)": + model = self.config.get("ollama_text_model") + host = self.config.get("ollama_host") + elif mode == "Ollama (Local Vision)": + model = self.config.get("ollama_vision_model") + host = self.config.get("ollama_host") + else: + model = self.config.get("openrouter_model") + host = self.config.get("openrouter_base_url") + self.log(f"Provider: {mode} | Model: {model} | Host: {host} | Timeout: {timeout}s") + + def run_tasks(self, tasks: List[str]) -> None: + for index, objective in enumerate(tasks, start=1): + self.config = load_config() + self.automation.update_settings( + dry_run=bool(self.config.get("dry_run", True)), + stop_hotkey=str(self.config.get("stop_hotkey", "ctrl+alt+s")), + confirm_before_execute=bool(self.config.get("confirm_before_execute", 
True)), + block_clicks=bool(self.config.get("block_clicks", True)), + block_terminal_typing=bool(self.config.get("block_terminal_typing", True)), + ) + llm = LLMEngine(self.config, logger=self.log) + looper = ActionLooper( + automation=self.automation, + request_action=llm.request_action, + parse_action=parse_action_text, + max_steps=int(self.config.get("max_steps", 20)), + delay_seconds=float(self.config.get("delay_seconds", 0.6)), + log_callback=self.log, + ) + self.log(f"Running task {index}/{len(tasks)}: {objective}") + self._log_provider_settings() + try: + outcome = looper.run(objective) + except LLMError as exc: + self.log(f"LLM error: {repr(exc)}") + break + except Exception as exc: # pylint: disable=broad-except + self.log(f"Unexpected error: {repr(exc)}") + break + self.log(f"Task result: {outcome}") + if self.automation.should_stop(): + self.log("Stopped before finishing all tasks.") + break + + def test_llm(self) -> None: + self.config = load_config() + result = test_provider(self.config) + self.log(result) + + def test_screenshot(self) -> None: + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp: + image, path = self.automation.capture_screenshot(save_path=tmp.name) + size = f"{image.width}x{image.height}" if image else "unknown" + self.log(f"Screenshot saved to {path} with size {size}") + except Exception as exc: # pylint: disable=broad-except + self.log(f"Screenshot test failed: {repr(exc)}") + + +def main() -> None: + root = tk.Tk() + AppController(root) + root.mainloop() + + +if __name__ == "__main__": + main() diff --git a/rebuild_v1/requirements.txt b/rebuild_v1/requirements.txt new file mode 100644 index 00000000..2aa9e07e --- /dev/null +++ b/rebuild_v1/requirements.txt @@ -0,0 +1,6 @@ +openai +requests +python-dotenv +pillow +pyautogui +keyboard From 340bee53e72851714473799555cba951212c8305 Mon Sep 17 00:00:00 2001 From: alikumale <149090479+alikumale@users.noreply.github.com> Date: Wed, 17 Dec 2025 10:28:40 +0800 
Subject: [PATCH 8/8] Harden JSON parsing and prompts --- rebuild_v1/.env.example | 2 + rebuild_v1/.gitignore | 5 + rebuild_v1/README.md | 62 +++++- rebuild_v1/automation.py | 272 +++++++++++++++++++++++++++ rebuild_v1/config.py | 46 +++++ rebuild_v1/engine_llm.py | 363 ++++++++++++++++++++++++++++++++++++ rebuild_v1/gui.py | 301 ++++++++++++++++++++++++++++++ rebuild_v1/main.py | 114 +++++++++++ rebuild_v1/requirements.txt | 6 + rebuild_v1/test_parse.py | 23 +++ 10 files changed, 1193 insertions(+), 1 deletion(-) create mode 100644 rebuild_v1/.env.example create mode 100644 rebuild_v1/.gitignore create mode 100644 rebuild_v1/automation.py create mode 100644 rebuild_v1/config.py create mode 100644 rebuild_v1/engine_llm.py create mode 100644 rebuild_v1/gui.py create mode 100644 rebuild_v1/main.py create mode 100644 rebuild_v1/requirements.txt create mode 100644 rebuild_v1/test_parse.py diff --git a/rebuild_v1/.env.example b/rebuild_v1/.env.example new file mode 100644 index 00000000..38354282 --- /dev/null +++ b/rebuild_v1/.env.example @@ -0,0 +1,2 @@ +# Optional: set your OpenRouter API key for Phase B +OPENROUTER_API_KEY= diff --git a/rebuild_v1/.gitignore b/rebuild_v1/.gitignore new file mode 100644 index 00000000..404642ea --- /dev/null +++ b/rebuild_v1/.gitignore @@ -0,0 +1,5 @@ +config.json +.env +.venv +__pycache__/ +*.pyc diff --git a/rebuild_v1/README.md b/rebuild_v1/README.md index c1e3b52e..f575297d 100644 --- a/rebuild_v1/README.md +++ b/rebuild_v1/README.md @@ -1 +1,61 @@ -HI +A minimal Windows-friendly rewrite that runs a decide-and-act loop with a local Ollama model (no API keys required). Phase B wiring for OpenRouter is also in place. 
+ +## Features +- Tkinter GUI with Tasks, Settings, and Logs tabs +- Provider modes: Ollama Text, Ollama Vision, OpenRouter API +- Safety toggles: Dry Run (default ON), Confirm Before Execute, Block Clicks, Block Terminal Typing, STOP hotkey/button +- Loop: screenshot → LLM JSON action → validate → (optionally) execute via `pyautogui` +- Config persisted to `rebuild_v1/config.json` + +## Requirements +- Python 3.11 on Windows +- [Ollama](https://ollama.com) running locally (default host `http://localhost:11434`) +- Optional: OpenRouter API key for Phase B + +Install Python packages: + +```bash +python -m venv .venv +.venv\\Scripts\\activate +pip install -r rebuild_v1/requirements.txt +``` + +## Running the app +```bash +python -m rebuild_v1.main +``` +If Tkinter fails to start in a headless environment, run on a local desktop session. + +## Recommended modes +- **Local Text**: fastest, safest with clicks blocked by default. Ideal when you want conservative hotkeys/typing. +- **Local Vision**: needs a vision model; may be slower on some PCs. Increase timeout (600s+) if the model is large. +- **OpenRouter API**: highest accuracy if you have an API key. + +Pull the default Ollama models before running: +```bash +ollama pull llama3.2:3b +ollama pull llava:7b +``` + +## Using Ollama (Phase A) +1. Start Ollama: `ollama serve` +2. Pick "Ollama (Local Text)" or "Ollama (Local Vision)" in Settings and adjust models/host if needed. +3. Vision mode sends the latest screenshot as base64 to the model. + +The app checks connectivity to Ollama and shows a friendly message if the server or model is unavailable. + +## Safety controls +- **Dry Run**: enabled by default; shows actions without executing. +- **Confirm Before Execute**: prompts before any real action; terminal windows require double-confirm for typing/hotkeys. +- **Block Clicks**: skips click actions and asks the model again. +- **Block Typing In Terminals**: prevents destructive commands unless you confirm twice. 
import threading
import time
from dataclasses import dataclass
from typing import Callable, Dict, Optional, Tuple

import keyboard
import pyautogui
import tkinter as tk
from PIL import ImageGrab
from tkinter import messagebox

# The model drives the mouse programmatically; the corner "failsafe" abort
# would otherwise kill legitimate runs.
pyautogui.FAILSAFE = False


@dataclass
class ActionResult:
    """Outcome of one action attempt."""

    # The action dict that was (or would have been) executed.
    action: Dict[str, object]
    # True only when the action actually ran (not dry-run / cancelled / failed).
    executed: bool
    # Human-readable failure or cancellation reason, if any.
    error: Optional[str] = None


class AutomationEngine:
    """Executes model-proposed actions with layered safety checks."""

    def __init__(
        self,
        dry_run: bool = True,
        stop_hotkey: str = "ctrl+alt+s",
        confirm_before_execute: bool = True,
        block_clicks: bool = True,
        block_terminal_typing: bool = True,
        root: Optional[tk.Tk] = None,
    ):
        self.dry_run = dry_run
        self.stop_hotkey = stop_hotkey
        self.confirm_before_execute = confirm_before_execute
        self.block_clicks = block_clicks
        self.block_terminal_typing = block_terminal_typing
        self._stop_flag = threading.Event()
        self._hotkey_registered = False
        self.log_callback: Callable[[str], None] = lambda _msg: None
        # Tk root used only to parent confirmation dialogs; None means headless.
        self.root = root
        self.screen_size: Optional[Tuple[int, int]] = None

    def set_logger(self, logger: Callable[[str], None]) -> None:
        """Install the callback that receives log messages."""
        self.log_callback = logger

    def update_settings(
        self,
        *,
        dry_run: Optional[bool] = None,
        stop_hotkey: Optional[str] = None,
        confirm_before_execute: Optional[bool] = None,
        block_clicks: Optional[bool] = None,
        block_terminal_typing: Optional[bool] = None,
    ) -> None:
        """Apply any non-None settings; None leaves the current value alone."""
        if dry_run is not None:
            self.dry_run = dry_run
        if stop_hotkey is not None:
            self.stop_hotkey = stop_hotkey
        if confirm_before_execute is not None:
            self.confirm_before_execute = confirm_before_execute
        if block_clicks is not None:
            self.block_clicks = block_clicks
        if block_terminal_typing is not None:
            self.block_terminal_typing = block_terminal_typing

    def register_stop_hotkey(self) -> None:
        """Register the global stop hotkey once; failures are non-fatal.

        The keyboard package may require elevated privileges, and not every
        version exposes a ``KeyboardException`` attribute, so catch broadly
        rather than risk an AttributeError while handling the failure.
        """
        if self._hotkey_registered:
            return
        try:
            keyboard.add_hotkey(self.stop_hotkey, self.stop)
            self._hotkey_registered = True
        except Exception:  # pylint: disable=broad-except
            # Best-effort only; the GUI Stop button remains available.
            pass

    def stop(self) -> None:
        """Raise the stop flag; checked between steps and retries."""
        self._stop_flag.set()

    def reset_stop(self) -> None:
        """Clear the stop flag at the start of a new run."""
        self._stop_flag.clear()

    def should_stop(self) -> bool:
        """True once stop() has been called since the last reset."""
        return self._stop_flag.is_set()

    def set_screen_size(self, size: Tuple[int, int]) -> None:
        """Record the (width, height) of the virtual screen."""
        self.screen_size = size

    def capture_screenshot(self, save_path: Optional[str] = None):
        """Grab the screen; optionally save it. Returns (image, path-or-None)."""
        image = ImageGrab.grab()
        path = save_path
        if save_path:
            image.save(save_path)
        return image, path

    def validate_action(
        self, action: Dict[str, object], screen_size: Optional[Tuple[int, int]]
    ) -> Tuple[bool, Optional[str], bool]:
        """Check an action against safety settings and screen bounds.

        Returns (valid, rejection_reason, should_reask_model). Only clicks
        can currently be rejected; other action types always pass.
        """
        action_type = action.get("type")
        if action_type == "click":
            if self.block_clicks:
                return False, "Clicks are blocked by safety settings.", True
            if screen_size:
                width, height = screen_size
                x, y = int(action.get("x", -1)), int(action.get("y", -1))
                if x < 0 or y < 0 or x >= width or y >= height:
                    return False, "Click coordinates out of bounds.", True
        return True, None, False

    def _confirm_action(self, description: str, double_confirm: bool = False) -> bool:
        """Ask the user to approve an action; auto-approves when headless."""
        if self.root is None:
            return True
        proceed = messagebox.askyesno("Confirm Action", description)
        if not proceed:
            return False
        if double_confirm:
            proceed = messagebox.askyesno(
                "Confirm Again", f"Are you absolutely sure? {description}"
            )
        return proceed

    def _active_window_title(self) -> str:
        """Best-effort lookup of the focused window's title; '' on failure.

        NOTE(review): getActiveWindow/getActiveWindowTitle come from
        pyautogui's pygetwindow integration and may be absent on some
        platforms — both probes are therefore wrapped defensively.
        """
        try:
            win = pyautogui.getActiveWindow()
            if win and getattr(win, "title", None):
                return str(win.title)
        except Exception:  # pylint: disable=broad-except
            pass
        try:
            title = pyautogui.getActiveWindowTitle()
            if title:
                return str(title)
        except Exception:  # pylint: disable=broad-except
            pass
        return ""

    def _is_terminal_window(self) -> bool:
        """Heuristic: does the focused window title look like a terminal?"""
        title = self._active_window_title().lower()
        for keyword in ["powershell", "command prompt", "cmd", "terminal", "bash", "zsh"]:
            if keyword in title:
                return True
        return False

    def execute_action(self, action: Dict[str, object]) -> ActionResult:
        """Execute one validated action, honoring dry-run and confirmations."""
        if self.dry_run:
            return ActionResult(action=action, executed=False, error=None)

        action_type = action.get("type")
        description = f"Execute action: {action}"

        if self.confirm_before_execute:
            if not self._confirm_action(description):
                return ActionResult(action=action, executed=False, error="User cancelled action")

        # Typing into a terminal can run destructive commands: require a
        # second confirmation when the focused window looks like one.
        if action_type in {"type", "hotkey"} and self.block_terminal_typing and self._is_terminal_window():
            if not self._confirm_action(
                "Action targets a terminal-like window. Confirm twice to proceed.", double_confirm=True
            ):
                return ActionResult(action=action, executed=False, error="Blocked in terminal window")

        try:
            if action_type == "click":
                pyautogui.click(x=int(action["x"]), y=int(action["y"]))
            elif action_type == "type":
                pyautogui.typewrite(str(action["text"]))
            elif action_type == "hotkey":
                keys = [str(k) for k in action.get("keys", [])]
                pyautogui.hotkey(*keys)
            elif action_type == "wait":
                time.sleep(float(action.get("seconds", 0)))
            elif action_type == "done":
                pass
            else:
                return ActionResult(action=action, executed=False, error="Unknown action type")
            return ActionResult(action=action, executed=True, error=None)
        except Exception as exc:  # pylint: disable=broad-except
            return ActionResult(action=action, executed=False, error=str(exc))


class ActionLooper:
    """Runs the screenshot → LLM → validate → execute loop for one objective."""

    def __init__(
        self,
        automation: AutomationEngine,
        request_action: Callable[..., str],
        parse_action: Callable[[str, Optional[Callable[[str], None]]], Optional[Dict[str, object]]],
        max_steps: int,
        delay_seconds: float,
        log_callback: Callable[[str], None],
    ):
        self.automation = automation
        self.request_action = request_action
        self.parse_action = parse_action
        self.max_steps = max_steps
        self.delay_seconds = delay_seconds
        self.log_callback = log_callback

    def run(self, objective: str) -> str:
        """Drive the loop until done/stop/error; returns a human-readable outcome."""
        self.automation.reset_stop()
        self.automation.register_stop_hotkey()
        screen = pyautogui.size()
        self.automation.set_screen_size((screen.width, screen.height))
        self.log_callback(f"Screen size detected: {screen.width}x{screen.height}")

        for step in range(1, self.max_steps + 1):
            if self.automation.should_stop():
                return "Stopped by user"

            try:
                image, _ = self.automation.capture_screenshot()
                screenshot_note = f"screenshot captured ({image.width}x{image.height})"
                self.log_callback(f"Screenshot captured for step {step}: {image.width}x{image.height}")
            except Exception as exc:  # pylint: disable=broad-except
                screenshot_note = f"screenshot failed: {exc}"
                self.log_callback(f"Screenshot failed: {exc}")
                image = None

            screenshot_bytes = None
            if image:
                try:
                    from io import BytesIO

                    buffer = BytesIO()
                    image.save(buffer, format="PNG")
                    screenshot_bytes = buffer.getvalue()
                except Exception as exc:  # pylint: disable=broad-except
                    self.log_callback(f"Failed to serialize screenshot: {exc}")

            retries = 0
            action_data: Optional[Dict[str, object]] = None
            reask_guard = 0
            while retries < 3 and not self.automation.should_stop():
                raw = self.request_action(
                    objective,
                    screenshot_note=screenshot_note,
                    screenshot_bytes=screenshot_bytes,
                    screen_size=f"{screen.width}x{screen.height}",
                    retry_count=retries,
                )
                self.log_callback(f"Raw model response: {raw}")
                action_data = self.parse_action(raw, self.log_callback)
                if action_data is None:
                    retries += 1
                    self.log_callback("Model returned invalid JSON. Retrying...")
                    continue

                valid, reason, reask = self.automation.validate_action(
                    action_data, (screen.width, screen.height)
                )
                if not valid:
                    self.log_callback(reason or "Action rejected")
                    if reask:
                        reask_guard += 1
                        if reask_guard >= 3:
                            return "Action rejected repeatedly"
                    continue
                break

            # A stop during the re-ask loop is a user stop, not a parse failure.
            if self.automation.should_stop():
                return "Stopped by user"

            if action_data is None:
                return "Failed to parse action after retries"

            result = self.automation.execute_action(action_data)
            executed_text = "executed" if result.executed else "dry-run"
            self.log_callback(f"Action step {step}: {action_data} ({executed_text})")
            if result.error:
                self.log_callback(f"Action error: {result.error}")
                if result.error.startswith("User cancelled"):
                    return "User cancelled action"

            if action_data.get("type") == "done":
                return str(action_data.get("reason", "Done"))

            if self.automation.should_stop():
                return "Stopped by user"
            time.sleep(self.delay_seconds)
        return "Reached max steps"
import json
from pathlib import Path
from typing import Any, Dict

# Settings file lives next to this module so the app is self-contained.
CONFIG_PATH = Path(__file__).parent / "config.json"

DEFAULT_CONFIG: Dict[str, Any] = {
    "provider_mode": "Ollama (Local Text)",
    "ollama_host": "http://localhost:11434",
    "ollama_text_model": "llama3.2:3b",
    "ollama_vision_model": "llava:7b",
    "openrouter_api_key": "",
    "openrouter_model": "openrouter/auto",
    "openrouter_base_url": "https://openrouter.ai/api/v1",
    "llm_timeout_seconds": 600,
    "max_steps": 20,
    "delay_seconds": 0.6,
    "stop_hotkey": "ctrl+alt+s",
    "dry_run": True,
    "confirm_before_execute": True,
    "block_clicks": True,
    "block_terminal_typing": True,
}


def load_config() -> Dict[str, Any]:
    """Return the defaults overlaid with values persisted in config.json.

    Any read or parse failure yields a fresh copy of the defaults so the
    UI always starts with a usable configuration.
    """
    if not CONFIG_PATH.exists():
        return dict(DEFAULT_CONFIG)
    try:
        loaded = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError, TypeError):
        return dict(DEFAULT_CONFIG)
    if isinstance(loaded, dict):
        return {**DEFAULT_CONFIG, **loaded}
    return dict(DEFAULT_CONFIG)


def save_config(config: Dict[str, Any]) -> None:
    """Persist settings to config.json.

    Filesystem errors are swallowed deliberately: a read-only disk should
    not crash the UI mid-session.
    """
    try:
        CONFIG_PATH.write_text(json.dumps(config, indent=2), encoding="utf-8")
    except OSError:
        pass
import base64
import io
import json
from typing import Callable, Dict, List, Optional

import requests
from openai import OpenAI


class LLMError(Exception):
    """Raised when the language model call fails."""
def _clean_json_response(text: str) -> str:
    """Strip markdown fences and isolate the first balanced JSON object.

    Models often wrap JSON in ```json fences or surround it with prose;
    this trims both so json.loads sees only the object itself.
    """
    cleaned = text.strip()
    fence_start = cleaned.find("```")
    if fence_start != -1:
        fence_end = cleaned.find("```", fence_start + 3)
        if fence_end != -1:
            cleaned = cleaned[fence_start + 3 : fence_end]
            cleaned = cleaned.strip()
            # Drop a leading "json" language tag from the fence.
            if cleaned.lower().startswith("json"):
                cleaned = cleaned[4:].strip()

    # Keep only the first brace-balanced object; trailing prose is discarded.
    brace_start = cleaned.find("{")
    if brace_start != -1:
        brace_depth = 0
        end_idx = None
        for idx, ch in enumerate(cleaned[brace_start:], start=brace_start):
            if ch == "{":
                brace_depth += 1
            elif ch == "}":
                brace_depth -= 1
                if brace_depth == 0:
                    end_idx = idx
                    break
        if end_idx is not None:
            cleaned = cleaned[brace_start : end_idx + 1]
        else:
            cleaned = cleaned[brace_start:]
    return cleaned


def _is_number(value: object) -> bool:
    """True for int/float but not bool (bool subclasses int in Python)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def parse_action_text(text: str, logger: Optional[Callable[[str], None]] = None) -> Optional[Dict[str, object]]:
    """Parse the first JSON action object from model output.

    Returns a normalized action dict, or None when the text does not
    describe a valid action. Click coordinates may arrive as floats
    (models frequently emit 10.0 instead of 10); they are accepted and
    truncated to int. Booleans are rejected for numeric fields even
    though bool subclasses int.
    """
    cleaned = _clean_json_response(text)
    try:
        data = json.loads(cleaned)
    except json.JSONDecodeError as exc:
        if logger:
            logger(f"Failed to parse model response (raw first 500): {text[:500]}")
            logger(f"Cleaned string: {cleaned}")
            logger(f"Parse error: {repr(exc)}")
        return None

    if not isinstance(data, dict):
        return None
    action_type = data.get("type")
    if action_type not in {"click", "type", "hotkey", "wait", "done"}:
        return None
    if action_type == "click":
        x, y = data.get("x"), data.get("y")
        if _is_number(x) and _is_number(y):
            return {"type": "click", "x": int(x), "y": int(y)}
    elif action_type == "type":
        if isinstance(data.get("text"), str):
            return {"type": "type", "text": data["text"]}
    elif action_type == "hotkey":
        keys = data.get("keys")
        if isinstance(keys, str):
            keys = [keys]
        if isinstance(keys, list):
            normalized = []
            for key in keys:
                # Accept "ctrl+l" style combos as well as ["ctrl", "l"].
                if isinstance(key, str) and "+" in key:
                    normalized.extend(part.strip() for part in key.split("+") if part.strip())
                elif isinstance(key, str):
                    normalized.append(key)
            if normalized:
                return {"type": "hotkey", "keys": normalized}
    elif action_type == "wait":
        seconds = data.get("seconds")
        if _is_number(seconds):
            return {"type": "wait", "seconds": float(seconds)}
    elif action_type == "done":
        reason = data.get("reason")
        if isinstance(reason, str):
            return {"type": "done", "reason": reason}
    return None
class LLMEngine:
    """Builds prompts and dispatches chat requests to the configured provider."""

    # Cap for the /api/tags liveness probe: a health check should fail fast
    # instead of inheriting the full generation timeout (default 600 s).
    HEALTH_CHECK_TIMEOUT = 10.0

    def __init__(self, config: Dict[str, object], logger: Optional[Callable[[str], None]] = None):
        self.config = config
        self.logger = logger or (lambda _msg: None)

    def _raise(self, provider_mode: str, endpoint: str, model: str, exc: Exception) -> LLMError:
        """Build a uniformly formatted LLMError for a failed provider call."""
        return LLMError(
            f"[{provider_mode}] call to {endpoint} with model '{model}' failed: {repr(exc)}"
        )

    def _ollama_chat(
        self,
        messages: List[Dict[str, object]],
        model: str,
        host: str,
        timeout: float,
        provider_mode: str,
    ) -> str:
        """Send a chat request to a local Ollama server and return its text.

        Raises LLMError with actionable guidance for connectivity problems,
        missing models, and empty responses.
        """
        try:
            # Bound the liveness probe; requests.Timeout also covers
            # ConnectTimeout, so slow-to-connect servers get the same hint.
            health = requests.get(
                f"{host}/api/tags", timeout=min(timeout, self.HEALTH_CHECK_TIMEOUT)
            )
        except requests.Timeout as exc:
            raise LLMError(
                "Ollama timed out: increase timeout or use a smaller model. "
                f"Details: {repr(exc)}"
            ) from exc
        except requests.RequestException as exc:
            raise self._raise(provider_mode, f"{host}/api/tags", model, exc) from exc

        if health.status_code != 200:
            raise LLMError(
                f"[{provider_mode}] Ollama responded with status {health.status_code}; is it running?"
            )

        payload = {"model": model, "messages": messages, "stream": False}
        try:
            response = requests.post(
                f"{host}/api/chat", json=payload, timeout=timeout, stream=False
            )
        except requests.Timeout as exc:
            raise LLMError(
                "Ollama timed out: increase timeout or use a smaller model. "
                f"Details: {repr(exc)}"
            ) from exc
        except requests.RequestException as exc:
            raise self._raise(provider_mode, f"{host}/api/chat", model, exc) from exc

        if response.status_code == 404:
            raise LLMError(
                f"[{provider_mode}] Model '{model}' not found. Install with: ollama pull {model}"
            )
        if response.status_code >= 400:
            raise LLMError(
                f"[{provider_mode}] Ollama returned {response.status_code}: {response.text}"
            )

        try:
            data = response.json()
        except ValueError as exc:  # pragma: no cover - defensive
            raise self._raise(provider_mode, f"{host}/api/chat", model, exc) from exc

        message = data.get("message", {})
        content = message.get("content")
        if not content:
            raise LLMError(f"[{provider_mode}] Ollama returned an empty response.")
        return content

    def _openrouter_chat(
        self, messages: List[Dict[str, object]], model: str, base_url: str, api_key: str, timeout: float
    ) -> str:
        """Send a chat request through the OpenAI-compatible OpenRouter API."""
        if not api_key:
            raise LLMError("[OpenRouter (API)] Missing API key in settings.")

        client = OpenAI(api_key=api_key, base_url=base_url)
        try:
            chat = client.chat.completions.create(
                model=model,
                messages=messages,
                stream=False,
                timeout=timeout,
            )
        except Exception as exc:  # pylint: disable=broad-except
            raise LLMError(
                f"[OpenRouter (API)] call to {base_url} with model '{model}' failed: {repr(exc)}"
            ) from exc

        try:
            content = chat.choices[0].message.content
        except (AttributeError, IndexError):  # pragma: no cover - defensive
            content = None
        if not content:
            raise LLMError("[OpenRouter (API)] returned an empty response.")
        return content

    def request_action(
        self,
        objective: str,
        screenshot_note: str,
        screenshot_bytes: Optional[bytes] = None,
        screen_size: Optional[str] = None,
        retry_count: int = 0,
    ) -> str:
        """Ask the configured provider for the next action as raw JSON text.

        retry_count > 0 injects a warning that the previous reply was not
        valid JSON. Vision mode requires screenshot_bytes (PNG).
        """
        provider_mode = str(self.config.get("provider_mode", "Ollama (Local Text)"))
        timeout = float(self.config.get("llm_timeout_seconds", 600))
        common_schema = (
            "Allowed actions as ONE JSON object: "
            '{"type":"click","x":int,"y":int} | '
            '{"type":"type","text":str} | '
            '{"type":"hotkey","keys":[str,...]} | '
            '{"type":"wait","seconds":float} | '
            '{"type":"done","reason":str}.'
        )
        examples = (
            "Examples: {\"type\":\"hotkey\",\"keys\":[\"win\"]} | "
            "{\"type\":\"type\",\"text\":\"notepad\"} | "
            "{\"type\":\"click\",\"x\":120,\"y\":220} | "
            "{\"type\":\"wait\",\"seconds\":1.0} | "
            "{\"type\":\"done\",\"reason\":\"finished\"}."
        )
        json_only_rules = (
            "Output EXACTLY ONE JSON object. DO NOT wrap output in markdown or backticks. "
            "DO NOT include explanations or extra text. Markdown-wrapped JSON will be rejected. "
            "Output raw JSON only."
        )
        safe_text_rules = (
            "In text-only mode avoid free clicking. Prefer hotkeys, typing, waits, Win search, "
            "Ctrl+L for address bar, then type and Enter. Output raw JSON only."
        )
        vision_rules = (
            "You may click visible elements, but still prefer reliable hotkeys and typing when possible. "
            "Output raw JSON only."
        )
        screen_info = (
            f"The virtual screen resolution is {screen_size or 'unknown'} pixels. "
            "Coordinates must be within this range. Do not assume a single monitor."
        )
        retry_warning = (
            "Previous response had invalid JSON. Respond with raw JSON only, no markdown."
            if retry_count > 0
            else ""
        )
        user_block = (
            f"Objective: {objective}\n"
            f"Latest screenshot info: {screenshot_note}\n"
            f"Screen size: {screen_size or 'unknown'}\n"
            f"{screen_info}\n"
            "Allowed actions must use integers for x and y. Prefer hotkey/type/wait over click; "
            "only click when necessary. Reply with exactly one JSON object."
        )

        messages: List[Dict[str, object]] = [
            {
                "role": "system",
                "content": (
                    "You are controlling a computer. Decide the single next action. "
                    + common_schema
                    + " "
                    + examples
                    + " "
                    + json_only_rules
                ),
            }
        ]

        if provider_mode == "Ollama (Local Text)":
            messages.append({"role": "user", "content": safe_text_rules})
            if retry_warning:
                messages.append({"role": "user", "content": retry_warning})
            messages.append({"role": "user", "content": user_block})
            return self._ollama_chat(
                messages,
                model=str(self.config.get("ollama_text_model", "llama3.2:3b")),
                host=str(self.config.get("ollama_host", "http://localhost:11434")),
                timeout=timeout,
                provider_mode=provider_mode,
            )

        if provider_mode == "Ollama (Local Vision)":
            if screenshot_bytes is None:
                raise LLMError("Vision mode requires screenshot bytes.")
            b64_image = base64.b64encode(screenshot_bytes).decode("utf-8")
            messages.append({"role": "user", "content": vision_rules})
            if retry_warning:
                messages.append({"role": "user", "content": retry_warning})
            # Ollama's chat API takes base64 images on the message itself.
            messages.append(
                {
                    "role": "user",
                    "content": user_block,
                    "images": [b64_image],
                }
            )
            try:
                return self._ollama_chat(
                    messages,
                    model=str(self.config.get("ollama_vision_model", "llava:7b")),
                    host=str(self.config.get("ollama_host", "http://localhost:11434")),
                    timeout=timeout,
                    provider_mode=provider_mode,
                )
            except LLMError as exc:
                if "images" in str(exc).lower():
                    self.logger(
                        "The selected Ollama model may not support images. Try pulling llava:7b and set it in settings."
                    )
                raise

        if provider_mode == "OpenRouter (API)":
            messages.append({"role": "user", "content": safe_text_rules})
            if retry_warning:
                messages.append({"role": "user", "content": retry_warning})
            messages.append({"role": "user", "content": user_block})
            return self._openrouter_chat(
                messages,
                model=str(self.config.get("openrouter_model", "openrouter/auto")),
                base_url=str(self.config.get("openrouter_base_url", "https://openrouter.ai/api/v1")),
                api_key=str(self.config.get("openrouter_api_key", "")),
                timeout=timeout,
            )

        raise LLMError(f"Unsupported provider mode: {provider_mode}")


def test_provider(config: Dict[str, object]) -> str:
    """Run a one-shot connectivity check against the configured provider.

    Returns a human-readable success or failure string; never raises.
    """
    engine = LLMEngine(config)
    provider_mode = str(config.get("provider_mode", "Ollama (Local Text)"))
    timeout = float(config.get("llm_timeout_seconds", 600))
    if provider_mode.startswith("Ollama"):
        host = str(config.get("ollama_host", "http://localhost:11434"))
        model = (
            str(config.get("ollama_text_model", "llama3.2:3b"))
            if provider_mode == "Ollama (Local Text)"
            else str(config.get("ollama_vision_model", "llava:7b"))
        )
        try:
            tags = requests.get(f"{host}/api/tags", timeout=timeout)
            tags.raise_for_status()
        except Exception as exc:  # pylint: disable=broad-except
            return f"Ollama connectivity failed: {repr(exc)}"

        payload_messages: List[Dict[str, object]] = [
            {
                "role": "user",
                "content": "Reply OK",
            }
        ]
        if provider_mode == "Ollama (Local Vision)":
            # Tiny black PNG: just enough to exercise the image path.
            img = io.BytesIO()
            from PIL import Image  # lazy import to avoid overhead if unused

            Image.new("RGB", (2, 2), color="black").save(img, format="PNG")
            img_bytes = img.getvalue()
            payload_messages[0]["images"] = [base64.b64encode(img_bytes).decode("utf-8")]
        try:
            reply = engine._ollama_chat(  # pylint: disable=protected-access
                payload_messages, model=model, host=host, timeout=timeout, provider_mode=provider_mode
            )
        except Exception as exc:  # pylint: disable=broad-except
            return f"Ollama chat failed: {repr(exc)}"
        return f"Ollama test succeeded: {reply}" if reply else "Ollama test returned empty response."

    if provider_mode == "OpenRouter (API)":
        try:
            reply = engine._openrouter_chat(  # pylint: disable=protected-access
                messages=[{"role": "user", "content": "Reply OK"}],
                model=str(config.get("openrouter_model", "openrouter/auto")),
                base_url=str(config.get("openrouter_base_url", "https://openrouter.ai/api/v1")),
                api_key=str(config.get("openrouter_api_key", "")),
                timeout=timeout,
            )
        except Exception as exc:  # pylint: disable=broad-except
            return f"OpenRouter test failed: {repr(exc)}"
        return f"OpenRouter test succeeded: {reply}" if reply else "OpenRouter test returned empty response."

    return f"Unsupported provider mode: {provider_mode}"
import threading
import tkinter as tk
from tkinter import ttk
from typing import Callable, Dict, List

from config import load_config, save_config


class AppGUI:
    """Tkinter front end: Tasks, Settings, and Logs tabs plus callbacks."""

    def __init__(
        self,
        root: tk.Tk,
        run_callback: Callable[[List[str]], None],
        stop_callback: Callable[[], None],
        log_export: Callable[[Callable[[str], None]], None],
        test_llm_callback: Callable[[], None],
        test_screenshot_callback: Callable[[], None],
    ):
        self.root = root
        self.run_callback = run_callback
        self.stop_callback = stop_callback
        self.log_export = log_export
        self.test_llm_callback = test_llm_callback
        self.test_screenshot_callback = test_screenshot_callback
        self.config = load_config()
        self.tasks: List[str] = []

        root.title("Self-Operating Computer v1")
        root.geometry("900x620")

        notebook = ttk.Notebook(root)
        notebook.pack(fill=tk.BOTH, expand=True)

        # self.log_text is created inside _build_logs_tab; building the tabs
        # in order guarantees it exists before log_export hands out self.log.
        self._build_tasks_tab(notebook)
        self._build_settings_tab(notebook)
        self._build_logs_tab(notebook)

        self.log_export(self.log)

    def _build_tasks_tab(self, notebook: ttk.Notebook) -> None:
        """Create the Tasks tab: objective entry, run controls, safety toggles."""
        frame = ttk.Frame(notebook)
        notebook.add(frame, text="Tasks")

        ttk.Label(frame, text="Objective").pack(anchor=tk.W, padx=8, pady=(8, 2))
        self.objective_input = tk.Text(frame, height=4)
        self.objective_input.pack(fill=tk.X, padx=8)

        button_frame = ttk.Frame(frame)
        button_frame.pack(fill=tk.X, padx=8, pady=6)

        ttk.Button(button_frame, text="Add Task", command=self.add_task).pack(
            side=tk.LEFT, padx=4
        )
        ttk.Button(button_frame, text="Run", command=self.run_tasks).pack(
            side=tk.LEFT, padx=4
        )
        ttk.Button(button_frame, text="Stop", command=self.stop_callback).pack(
            side=tk.LEFT, padx=4
        )
        ttk.Button(button_frame, text="Clear", command=self.clear_tasks).pack(
            side=tk.LEFT, padx=4
        )

        safety_frame = ttk.LabelFrame(frame, text="Safety")
        safety_frame.pack(fill=tk.X, padx=8, pady=6)

        self.dry_run_var = tk.BooleanVar(value=bool(self.config.get("dry_run", True)))
        ttk.Checkbutton(
            safety_frame,
            text="Dry Run (do not execute actions)",
            variable=self.dry_run_var,
            command=self._persist_dry_run,
        ).pack(anchor=tk.W, padx=6, pady=2)

        self.confirm_var = tk.BooleanVar(
            value=bool(self.config.get("confirm_before_execute", True))
        )
        ttk.Checkbutton(
            safety_frame,
            text="Confirm Before Execute",
            variable=self.confirm_var,
            command=self._persist_confirm,
        ).pack(anchor=tk.W, padx=6, pady=2)

        self.block_clicks_var = tk.BooleanVar(value=bool(self.config.get("block_clicks", True)))
        ttk.Checkbutton(
            safety_frame,
            text="Block Click Actions",
            variable=self.block_clicks_var,
            command=self._persist_block_clicks,
        ).pack(anchor=tk.W, padx=6, pady=2)

        self.block_terminal_var = tk.BooleanVar(
            value=bool(self.config.get("block_terminal_typing", True))
        )
        ttk.Checkbutton(
            safety_frame,
            text="Block Typing In Terminals",
            variable=self.block_terminal_var,
            command=self._persist_block_terminal,
        ).pack(anchor=tk.W, padx=6, pady=2)

        ttk.Label(frame, text="Task Queue").pack(anchor=tk.W, padx=8, pady=(10, 2))
        self.tasks_list = tk.Listbox(frame, height=10)
        self.tasks_list.pack(fill=tk.BOTH, expand=True, padx=8, pady=(0, 8))

    def _build_settings_tab(self, notebook: ttk.Notebook) -> None:
        """Create the Settings tab: provider selection and run parameters."""
        frame = ttk.Frame(notebook)
        notebook.add(frame, text="Settings")

        provider_frame = ttk.LabelFrame(frame, text="Provider")
        provider_frame.pack(fill=tk.X, padx=8, pady=6)
        ttk.Label(provider_frame, text="Provider Mode").pack(anchor=tk.W)
        self.provider_var = tk.StringVar(
            value=str(self.config.get("provider_mode", "Ollama (Local Text)"))
        )
        provider_menu = ttk.Combobox(
            provider_frame,
            textvariable=self.provider_var,
            values=[
                "Ollama (Local Text)",
                "Ollama (Local Vision)",
                "OpenRouter (API)",
            ],
            state="readonly",
        )
        provider_menu.pack(fill=tk.X, pady=2)
        # Combobox fires the <<ComboboxSelected>> virtual event on selection.
        provider_menu.bind("<<ComboboxSelected>>", lambda _event: self._toggle_provider_fields())

        self.ollama_host_var = tk.StringVar(value=str(self.config.get("ollama_host")))
        self.ollama_text_model_var = tk.StringVar(
            value=str(self.config.get("ollama_text_model"))
        )
        self.ollama_vision_model_var = tk.StringVar(
            value=str(self.config.get("ollama_vision_model"))
        )
        self.openrouter_key_var = tk.StringVar(value=str(self.config.get("openrouter_api_key")))
        self.openrouter_model_var = tk.StringVar(value=str(self.config.get("openrouter_model")))
        self.openrouter_base_var = tk.StringVar(
            value=str(self.config.get("openrouter_base_url"))
        )

        self.provider_container = ttk.Frame(provider_frame)
        self.provider_container.pack(fill=tk.X, padx=4, pady=4)
        self._build_provider_fields()

        extras = ttk.LabelFrame(frame, text="Run Settings")
        extras.pack(fill=tk.X, padx=8, pady=6)

        ttk.Label(extras, text="LLM Timeout (seconds)").pack(anchor=tk.W)
        self.timeout_var = tk.IntVar(value=int(self.config.get("llm_timeout_seconds", 600)))
        ttk.Entry(extras, textvariable=self.timeout_var).pack(fill=tk.X, pady=2)

        ttk.Label(extras, text="Max steps").pack(anchor=tk.W, pady=(6, 0))
        self.max_steps_var = tk.IntVar(value=int(self.config.get("max_steps", 20)))
        ttk.Entry(extras, textvariable=self.max_steps_var).pack(fill=tk.X, pady=2)

        ttk.Label(extras, text="Delay between actions (seconds)").pack(anchor=tk.W, pady=(6, 0))
        self.delay_var = tk.DoubleVar(value=float(self.config.get("delay_seconds", 0.6)))
        ttk.Entry(extras, textvariable=self.delay_var).pack(fill=tk.X, pady=2)

        ttk.Label(extras, text="Stop hotkey (e.g., ctrl+alt+s)").pack(anchor=tk.W, pady=(6, 0))
        self.stop_hotkey_var = tk.StringVar(
            value=str(self.config.get("stop_hotkey", "ctrl+alt+s"))
        )
        ttk.Entry(extras, textvariable=self.stop_hotkey_var).pack(fill=tk.X, pady=2)

        button_bar = ttk.Frame(frame)
        button_bar.pack(fill=tk.X, padx=8, pady=10)
        ttk.Button(button_bar, text="Save Settings", command=self.save_settings).pack(
            side=tk.RIGHT, padx=4
        )
        ttk.Button(button_bar, text="Test LLM", command=self._test_llm).pack(
            side=tk.RIGHT, padx=4
        )
        ttk.Button(button_bar, text="Test Screenshot", command=self._test_screenshot).pack(
            side=tk.RIGHT, padx=4
        )

    def _build_provider_fields(self) -> None:
        """Rebuild provider-specific entry fields for the selected mode."""
        for child in list(self.provider_container.winfo_children()):
            child.destroy()

        provider = self.provider_var.get()
        if provider.startswith("Ollama"):
            ttk.Label(self.provider_container, text="Ollama Host").pack(anchor=tk.W)
            ttk.Entry(self.provider_container, textvariable=self.ollama_host_var).pack(
                fill=tk.X, pady=2
            )
            ttk.Label(self.provider_container, text="Ollama Text Model").pack(anchor=tk.W, pady=(6, 0))
            ttk.Entry(self.provider_container, textvariable=self.ollama_text_model_var).pack(
                fill=tk.X, pady=2
            )
            ttk.Label(self.provider_container, text="Ollama Vision Model").pack(anchor=tk.W, pady=(6, 0))
            ttk.Entry(self.provider_container, textvariable=self.ollama_vision_model_var).pack(
                fill=tk.X, pady=2
            )
        else:
            ttk.Label(self.provider_container, text="OpenRouter API Key").pack(anchor=tk.W)
            ttk.Entry(self.provider_container, textvariable=self.openrouter_key_var, show="*").pack(
                fill=tk.X, pady=2
            )
            ttk.Label(self.provider_container, text="OpenRouter Model").pack(anchor=tk.W, pady=(6, 0))
            ttk.Entry(self.provider_container, textvariable=self.openrouter_model_var).pack(
                fill=tk.X, pady=2
            )
            ttk.Label(self.provider_container, text="OpenRouter Base URL").pack(anchor=tk.W, pady=(6, 0))
            ttk.Entry(self.provider_container, textvariable=self.openrouter_base_var).pack(
                fill=tk.X, pady=2
            )

    def _build_logs_tab(self, notebook: ttk.Notebook) -> None:
        """Create the Logs tab with a read-only, scrollable text widget."""
        frame = ttk.Frame(notebook)
        notebook.add(frame, text="Logs")
        scrollbar = ttk.Scrollbar(frame)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.log_text = tk.Text(frame, wrap="word", state=tk.DISABLED)
        self.log_text.pack(fill=tk.BOTH, expand=True)
        self.log_text.config(yscrollcommand=scrollbar.set)
        scrollbar.config(command=self.log_text.yview)

    def add_task(self) -> None:
        """Move the objective text into the task queue."""
        text = self.objective_input.get("1.0", tk.END).strip()
        if text:
            self.tasks.append(text)
            self.tasks_list.insert(tk.END, text)
            self.objective_input.delete("1.0", tk.END)

    def clear_tasks(self) -> None:
        """Empty the task queue and its listbox."""
        self.tasks.clear()
        self.tasks_list.delete(0, tk.END)

    def run_tasks(self) -> None:
        """Run the queue on a daemon thread so the UI stays responsive."""
        if not self.tasks:
            # Convenience: run whatever is typed in the objective box.
            text = self.objective_input.get("1.0", tk.END).strip()
            if text:
                self.tasks.append(text)
                self.tasks_list.insert(tk.END, text)
        if not self.tasks:
            self.log("No tasks to run.")
            return
        threading.Thread(target=self.run_callback, args=(self.tasks.copy(),), daemon=True).start()

    def _persist_dry_run(self) -> None:
        self.config["dry_run"] = self.dry_run_var.get()
        save_config(self.config)

    def _persist_confirm(self) -> None:
        self.config["confirm_before_execute"] = self.confirm_var.get()
        save_config(self.config)

    def _persist_block_clicks(self) -> None:
        self.config["block_clicks"] = self.block_clicks_var.get()
        save_config(self.config)

    def _persist_block_terminal(self) -> None:
        self.config["block_terminal_typing"] = self.block_terminal_var.get()
        save_config(self.config)

    def save_settings(self) -> None:
        """Collect every settings widget value, persist, and refresh fields."""
        self.config.update(
            {
                "provider_mode": self.provider_var.get(),
                "ollama_host": self.ollama_host_var.get(),
                "ollama_text_model": self.ollama_text_model_var.get(),
                "ollama_vision_model": self.ollama_vision_model_var.get(),
                "openrouter_api_key": self.openrouter_key_var.get(),
                "openrouter_model": self.openrouter_model_var.get(),
                "openrouter_base_url": self.openrouter_base_var.get(),
                "llm_timeout_seconds": self.timeout_var.get(),
                "max_steps": self.max_steps_var.get(),
                "delay_seconds": self.delay_var.get(),
                "stop_hotkey": self.stop_hotkey_var.get(),
                "dry_run": self.dry_run_var.get(),
                "confirm_before_execute": self.confirm_var.get(),
                "block_clicks": self.block_clicks_var.get(),
                "block_terminal_typing": self.block_terminal_var.get(),
            }
        )
        save_config(self.config)
        self.log("Settings saved.")
        self._build_provider_fields()

    def _toggle_provider_fields(self) -> None:
        self.save_settings()
        self._build_provider_fields()

    def _test_llm(self) -> None:
        self.save_settings()
        self.test_llm_callback()

    def _test_screenshot(self) -> None:
        self.test_screenshot_callback()

    def log(self, message: str) -> None:
        """Append a line to the log pane; safe to call from worker threads.

        Tk widgets are not thread-safe and run_tasks logs from a background
        thread, so the actual insert is marshalled onto the main loop.
        """
        self.root.after(0, self._append_log, message)

    def _append_log(self, message: str) -> None:
        """Insert one line into the read-only log widget (main thread only)."""
        self.log_text.configure(state=tk.NORMAL)
        self.log_text.insert(tk.END, message + "\n")
        self.log_text.see(tk.END)
        self.log_text.configure(state=tk.DISABLED)
root + self.logger: Callable[[str], None] = lambda msg: None + self.config = load_config() + self.automation = AutomationEngine( + dry_run=bool(self.config.get("dry_run", True)), + stop_hotkey=str(self.config.get("stop_hotkey", "ctrl+alt+s")), + confirm_before_execute=bool(self.config.get("confirm_before_execute", True)), + block_clicks=bool(self.config.get("block_clicks", True)), + block_terminal_typing=bool(self.config.get("block_terminal_typing", True)), + root=root, + ) + self.gui = AppGUI( + root, + self.run_tasks, + self.stop, + self.export_logger, + self.test_llm, + self.test_screenshot, + ) + + def export_logger(self, logger: Callable[[str], None]) -> None: + self.logger = logger + self.automation.set_logger(logger) + + def log(self, message: str) -> None: + self.logger(message) + + def stop(self) -> None: + self.automation.stop() + self.log("Stop signal sent.") + + def _log_provider_settings(self) -> None: + mode = str(self.config.get("provider_mode")) + timeout = self.config.get("llm_timeout_seconds") + if mode == "Ollama (Local Text)": + model = self.config.get("ollama_text_model") + host = self.config.get("ollama_host") + elif mode == "Ollama (Local Vision)": + model = self.config.get("ollama_vision_model") + host = self.config.get("ollama_host") + else: + model = self.config.get("openrouter_model") + host = self.config.get("openrouter_base_url") + self.log(f"Provider: {mode} | Model: {model} | Host: {host} | Timeout: {timeout}s") + + def run_tasks(self, tasks: List[str]) -> None: + for index, objective in enumerate(tasks, start=1): + self.config = load_config() + self.automation.update_settings( + dry_run=bool(self.config.get("dry_run", True)), + stop_hotkey=str(self.config.get("stop_hotkey", "ctrl+alt+s")), + confirm_before_execute=bool(self.config.get("confirm_before_execute", True)), + block_clicks=bool(self.config.get("block_clicks", True)), + block_terminal_typing=bool(self.config.get("block_terminal_typing", True)), + ) + llm = 
LLMEngine(self.config, logger=self.log) + looper = ActionLooper( + automation=self.automation, + request_action=llm.request_action, + parse_action=parse_action_text, + max_steps=int(self.config.get("max_steps", 20)), + delay_seconds=float(self.config.get("delay_seconds", 0.6)), + log_callback=self.log, + ) + self.log(f"Running task {index}/{len(tasks)}: {objective}") + self._log_provider_settings() + try: + outcome = looper.run(objective) + except LLMError as exc: + self.log(f"LLM error: {repr(exc)}") + break + except Exception as exc: # pylint: disable=broad-except + self.log(f"Unexpected error: {repr(exc)}") + break + self.log(f"Task result: {outcome}") + if self.automation.should_stop(): + self.log("Stopped before finishing all tasks.") + break + + def test_llm(self) -> None: + self.config = load_config() + result = test_provider(self.config) + self.log(result) + + def test_screenshot(self) -> None: + try: + with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp: + image, path = self.automation.capture_screenshot(save_path=tmp.name) + size = f"{image.width}x{image.height}" if image else "unknown" + self.log(f"Screenshot saved to {path} with size {size}") + except Exception as exc: # pylint: disable=broad-except + self.log(f"Screenshot test failed: {repr(exc)}") + + +def main() -> None: + root = tk.Tk() + AppController(root) + root.mainloop() + + +if __name__ == "__main__": + main() diff --git a/rebuild_v1/requirements.txt b/rebuild_v1/requirements.txt new file mode 100644 index 00000000..2aa9e07e --- /dev/null +++ b/rebuild_v1/requirements.txt @@ -0,0 +1,6 @@ +openai +requests +python-dotenv +pillow +pyautogui +keyboard diff --git a/rebuild_v1/test_parse.py b/rebuild_v1/test_parse.py new file mode 100644 index 00000000..6843df91 --- /dev/null +++ b/rebuild_v1/test_parse.py @@ -0,0 +1,23 @@ +"""Minimal parser validation for JSON cleaning.""" +from engine_llm import parse_action_text + + +def run_case(name: str, text: str) -> None: + result = 
def run_case(name: str, text: str) -> None:
    """Parse *text* with the JSON cleaner and print PASS/FAIL plus the result."""
    parsed = parse_action_text(text)
    verdict = "PASS" if parsed is not None else "FAIL"
    print(f"{name}: {verdict} -> {parsed}")


def main() -> None:
    """Exercise the parser on a fenced JSON block and on noisy surrounding text."""
    wrapped = """
```json
{"type": "wait", "seconds": 1}
```
"""
    noisy = "Some intro {\n \"type\": \"done\", \"reason\": \"ok\"\n}\n trailing"
    run_case("Wrapped JSON", wrapped)
    run_case("Noisy text", noisy)


if __name__ == "__main__":
    main()