Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions rebuild_v1/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Optional: set your OpenRouter API key for Phase B
OPENROUTER_API_KEY=
5 changes: 5 additions & 0 deletions rebuild_v1/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
config.json
.env
.venv
__pycache__/
*.pyc
48 changes: 48 additions & 0 deletions rebuild_v1/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Self-Operating Computer v1 (clean rebuild)

A minimal Windows-friendly rewrite that runs a decide-and-act loop with a local Ollama model (no API keys required). Phase B wiring for OpenRouter is also in place.

## Features
- Tkinter GUI with Tasks, Settings, and Logs tabs
- Dry Run safety (on by default) and STOP hotkey/button
- Loop: screenshot → LLM JSON action → validate → (optionally) execute via `pyautogui`
- Config persisted to `rebuild_v1/config.json`

## Requirements
- Python 3.11 on Windows
- [Ollama](https://ollama.com) running locally (default host `http://localhost:11434`)
- Optional: OpenRouter API key for Phase B

Install Python packages:

```bash
python -m venv .venv
.venv\Scripts\activate
pip install -r rebuild_v1/requirements.txt
```

## Running the app
```bash
python -m rebuild_v1.main
```
If Tkinter fails to start in a headless environment, run on a local desktop session.

## Using Ollama (Phase A)
1. Start Ollama: `ollama serve`
2. Pull the default model once: `ollama pull llama3.2:3b`
3. Ensure the host and model fields in **Settings** match your setup.

The app checks connectivity to Ollama and shows a friendly message if the server or model is unavailable.

## Safety controls
- **Dry Run**: enabled by default; shows actions without executing.
- **STOP hotkey**: `Ctrl+Alt+S` (configurable) registered via the `keyboard` library.
- **STOP button**: halts the current loop.
- **Max steps** and **delay** configurable in Settings.

## OpenRouter (Phase B)
A provider dropdown exists; if "OpenRouter (API)" is chosen, set your API key, model, and base URL in Settings. Requests use the OpenAI-compatible SDK.

## Notes
- Screenshots use Pillow's `ImageGrab`; ensure a visible desktop session.
- Avoid sharing `config.json`—it is gitignored and may contain sensitive keys.
133 changes: 133 additions & 0 deletions rebuild_v1/automation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import threading
import time
from dataclasses import dataclass
from typing import Callable, Dict, Optional

import keyboard
import pyautogui
from PIL import ImageGrab

pyautogui.FAILSAFE = False


@dataclass
class ActionResult:
    """Outcome of attempting a single automation action."""

    # The action dict that was requested (e.g. {"type": "click", "x": 10, "y": 20}).
    action: Dict[str, object]
    # True only when the action was actually performed (False in dry-run or on error).
    executed: bool
    # Human-readable failure description, or None on success / dry-run.
    error: Optional[str] = None


class AutomationEngine:
    """Executes screen actions via pyautogui, with dry-run and stop controls.

    Safety features:
    - ``dry_run``: when True (the default), actions are reported but never executed.
    - stop hotkey / :meth:`stop`: cooperative cancellation; loops poll
      :meth:`should_stop` between steps.
    """

    # Action types understood by execute_action.
    _KNOWN_ACTION_TYPES = frozenset({"click", "type", "hotkey", "wait", "done"})

    def __init__(self, dry_run: bool = True, stop_hotkey: str = "ctrl+alt+s"):
        self.dry_run = dry_run
        self.stop_hotkey = stop_hotkey
        self._stop_flag = threading.Event()
        self._hotkey_registered = False

    def register_stop_hotkey(self) -> None:
        """Best-effort registration of the global STOP hotkey (idempotent)."""
        if self._hotkey_registered:
            return
        try:
            keyboard.add_hotkey(self.stop_hotkey, self.stop)
            self._hotkey_registered = True
        except Exception:  # pylint: disable=broad-except
            # Global hooks fail in many ways (missing privileges, bad hotkey
            # string, no display). The hotkey is a convenience, not a
            # requirement — the STOP button still works — so swallow the error
            # rather than crash the UI. NOTE(review): the original caught
            # keyboard.KeyboardException, an attribute the `keyboard` package
            # does not reliably define, which could itself raise AttributeError.
            pass

    def stop(self) -> None:
        """Signal any running loop to halt at its next checkpoint."""
        self._stop_flag.set()

    def reset_stop(self) -> None:
        """Clear the stop signal before starting a new loop."""
        self._stop_flag.clear()

    def should_stop(self) -> bool:
        """Return True once stop() has been called (or the hotkey fired)."""
        return self._stop_flag.is_set()

    def capture_screenshot(self, save_path: Optional[str] = None):
        """Grab the full screen; optionally save to *save_path*. Returns the PIL image."""
        image = ImageGrab.grab()
        if save_path:
            image.save(save_path)
        return image

    def execute_action(self, action: Dict[str, object]) -> ActionResult:
        """Validate and (unless dry_run) perform a single action dict.

        Never raises: failures are reported via ``ActionResult.error``.
        """
        action_type = action.get("type")
        if action_type not in self._KNOWN_ACTION_TYPES:
            # Validate BEFORE the dry-run shortcut so invalid actions are
            # flagged during previews too, not only during real execution.
            return ActionResult(action=action, executed=False, error="Unknown action type")

        if self.dry_run:
            return ActionResult(action=action, executed=False, error=None)

        try:
            if action_type == "click":
                pyautogui.click(x=int(action["x"]), y=int(action["y"]))
            elif action_type == "type":
                pyautogui.typewrite(str(action["text"]))
            elif action_type == "hotkey":
                keys = [str(k) for k in action.get("keys", [])]
                pyautogui.hotkey(*keys)
            elif action_type == "wait":
                time.sleep(float(action.get("seconds", 0)))
            else:  # "done" — terminal marker, nothing to execute
                pass
            return ActionResult(action=action, executed=True, error=None)
        except Exception as exc:  # pylint: disable=broad-except
            # Missing keys, bad values, or pyautogui runtime errors all land
            # here; surface the message instead of killing the loop thread.
            return ActionResult(action=action, executed=False, error=str(exc))


class ActionLooper:
    """Coordinates the screenshot -> decide -> act loop for one objective."""

    def __init__(
        self,
        automation: AutomationEngine,
        request_action: Callable[[str, str], str],
        parse_action: Callable[[str], Optional[Dict[str, object]]],
        max_steps: int,
        delay_seconds: float,
        log_callback: Callable[[str], None],
    ):
        # All collaborators are injected so the loop is testable in isolation.
        self.automation = automation
        self.request_action = request_action
        self.parse_action = parse_action
        self.max_steps = max_steps
        self.delay_seconds = delay_seconds
        self.log_callback = log_callback

    def _decide(self, objective: str, note: str) -> Optional[Dict[str, object]]:
        """Ask the model for one action, retrying up to three times on bad JSON."""
        for _ in range(3):
            reply = self.request_action(objective, note)
            self.log_callback(f"Raw model response: {reply}")
            parsed = self.parse_action(reply)
            if parsed is not None:
                return parsed
            self.log_callback("Model returned invalid JSON. Retrying...")
        return None

    def run(self, objective: str) -> str:
        """Run up to ``max_steps`` iterations; return a human-readable outcome."""
        automation = self.automation
        automation.reset_stop()
        automation.register_stop_hotkey()

        note = "Screenshot captured"
        for step_number in range(1, self.max_steps + 1):
            if automation.should_stop():
                return "Stopped by user"

            # Capture a fresh screenshot; failure is logged, not fatal.
            try:
                automation.capture_screenshot()
            except Exception as exc:  # pylint: disable=broad-except
                self.log_callback(f"Screenshot failed: {exc}")
                note = "screenshot failed"
            else:
                note = "screenshot taken"

            decided = self._decide(objective, note)
            if decided is None:
                return "Failed to parse action after retries"

            outcome = automation.execute_action(decided)
            mode = "executed" if outcome.executed else "dry-run"
            self.log_callback(f"Action step {step_number}: {decided} ({mode})")
            if outcome.error:
                self.log_callback(f"Action error: {outcome.error}")
            if decided.get("type") == "done":
                return str(decided.get("reason", "Done"))

            if automation.should_stop():
                return "Stopped by user"
            time.sleep(self.delay_seconds)

        return "Reached max steps"
39 changes: 39 additions & 0 deletions rebuild_v1/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import json
from pathlib import Path
from typing import Any, Dict

# Settings are persisted next to this module (config.json is gitignored since
# it may hold an API key).
CONFIG_PATH = Path(__file__).parent / "config.json"

# Baseline settings; values read from config.json are overlaid on top of these.
DEFAULT_CONFIG: Dict[str, Any] = {
    "provider": "Ollama (Local)",  # provider dropdown value; alternative: "OpenRouter (API)"
    "ollama_host": "http://localhost:11434",  # local Ollama server
    "ollama_model": "llama3.2:3b",
    "openrouter_api_key": "",  # empty until the user supplies one in Settings
    "openrouter_model": "openrouter/auto",
    "openrouter_base_url": "https://openrouter.ai/api/v1",  # OpenAI-compatible endpoint
    "max_steps": 10,  # cap on loop iterations per objective
    "delay_seconds": 0.6,  # pause between executed actions
    "stop_hotkey": "ctrl+alt+s",
    "dry_run": True,  # safety default: preview actions without executing them
}


def load_config() -> Dict[str, Any]:
    """Load config.json overlaid on DEFAULT_CONFIG.

    Returns a fresh dict in every case; a missing, unreadable, malformed, or
    non-object config file silently falls back to the defaults so the UI can
    always start.
    """
    if not CONFIG_PATH.exists():
        return DEFAULT_CONFIG.copy()
    try:
        with CONFIG_PATH.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, OSError):
        return DEFAULT_CONFIG.copy()
    if not isinstance(data, dict):
        # Valid JSON that is not an object (e.g. a list or string) would
        # otherwise raise TypeError in the mapping merge below.
        return DEFAULT_CONFIG.copy()
    return {**DEFAULT_CONFIG, **data}


def save_config(config: Dict[str, Any]) -> None:
    """Write *config* to CONFIG_PATH as pretty-printed JSON.

    Write failures are deliberately swallowed: a read-only filesystem should
    not crash the UI — the settings simply are not persisted.
    """
    try:
        CONFIG_PATH.write_text(json.dumps(config, indent=2), encoding="utf-8")
    except OSError:
        pass
151 changes: 151 additions & 0 deletions rebuild_v1/engine_llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import json
from typing import Dict, List, Optional

import requests
from openai import OpenAI


class LLMError(Exception):
    """Raised when the language model call fails.

    Covers connectivity problems, HTTP errors, malformed or empty replies,
    and misconfiguration (missing API key, unknown provider). The message is
    user-friendly and intended for direct display in the UI.
    """


class LLMEngine:
    """Routes chat requests to the configured provider (Ollama or OpenRouter).

    Every failure mode is normalized into :class:`LLMError` with a
    user-friendly message suitable for direct display in the UI.
    """

    def __init__(self, config: Dict[str, object]):
        # Config dict shared with the UI; values are read per call so that
        # Settings edits take effect without recreating the engine.
        self.config = config

    def _ollama_chat(self, messages: List[Dict[str, str]]) -> str:
        """Send *messages* to a local Ollama server and return the reply text.

        Raises:
            LLMError: on connectivity failure, HTTP error, malformed body,
                or an empty reply.
        """
        host = str(self.config.get("ollama_host") or "http://localhost:11434")
        model = str(self.config.get("ollama_model") or "llama3.2:3b")

        # Cheap health probe first so connection problems produce a clear,
        # actionable message instead of a chat-call timeout.
        try:
            health = requests.get(f"{host}/api/tags", timeout=3)
        except requests.RequestException as exc:
            raise LLMError(
                "Could not reach Ollama. Please ensure it is running on this machine."
            ) from exc

        if health.status_code != 200:
            raise LLMError(
                "Ollama responded unexpectedly. Please restart Ollama and try again."
            )

        payload = {
            "model": model,
            "messages": messages,
            "stream": False,  # single JSON body, not a streamed response
        }
        try:
            response = requests.post(
                f"{host}/api/chat", json=payload, timeout=30, stream=False
            )
        except requests.RequestException as exc:
            raise LLMError(
                "Failed to call Ollama. Is the service running on the configured host?"
            ) from exc

        if response.status_code == 404:
            raise LLMError(
                "Model not found. Please install it with: ollama pull llama3.2:3b"
            )
        if response.status_code >= 500:
            raise LLMError("Ollama server error. Please try again after a moment.")
        if response.status_code >= 400:
            raise LLMError(
                "Ollama rejected the request. If the model is missing, run: ollama pull llama3.2:3b"
            )

        try:
            data = response.json()
        except ValueError as exc:
            # A truncated or non-JSON body previously escaped as an uncaught
            # exception; keep it inside the LLMError contract instead.
            raise LLMError("Ollama returned a malformed response.") from exc

        message = data.get("message", {})
        # Guard against a non-dict "message" field before .get().
        content = message.get("content") if isinstance(message, dict) else None
        if not content:
            raise LLMError("Ollama returned an empty response.")
        return content

    def _openrouter_chat(self, messages: List[Dict[str, str]]) -> str:
        """Send *messages* to OpenRouter through the OpenAI-compatible SDK.

        Raises:
            LLMError: if the API key is missing, the call fails, or the
                reply is empty.
        """
        api_key = str(self.config.get("openrouter_api_key") or "")
        model = str(self.config.get("openrouter_model") or "openrouter/auto")
        base_url = str(self.config.get("openrouter_base_url") or "https://openrouter.ai/api/v1")

        if not api_key:
            raise LLMError("OpenRouter API key is missing in settings.")

        client = OpenAI(api_key=api_key, base_url=base_url)
        try:
            chat = client.chat.completions.create(
                model=model,
                messages=messages,
                stream=False,
            )
        except Exception as exc:  # pylint: disable=broad-except
            # The SDK raises many exception types (auth, network, rate limit);
            # collapse them into the single error contract the UI understands.
            raise LLMError("OpenRouter call failed. Check network and API key.") from exc

        try:
            content = chat.choices[0].message.content
        except (AttributeError, IndexError):
            content = None
        if not content:
            raise LLMError("OpenRouter returned an empty response.")
        return content

    def chat(self, messages: List[Dict[str, str]]) -> str:
        """Dispatch *messages* to the provider selected in settings.

        Raises:
            LLMError: if the provider is unknown or the call fails.
        """
        provider = self.config.get("provider", "Ollama (Local)")
        if provider == "Ollama (Local)":
            return self._ollama_chat(messages)
        if provider == "OpenRouter (API)":
            return self._openrouter_chat(messages)
        raise LLMError("Unsupported provider selected.")

    def request_action(self, objective: str, screenshot_note: str) -> str:
        """Build the JSON-only action prompt and return the raw model reply."""
        prompt = (
            "You are controlling a computer. Decide the SINGLE next action as strict JSON only. "
            "Use one of: click, type, hotkey, wait, done. Respond with JSON only."
        )
        instructions = (
            "Schema: {\"type\":\"click\",\"x\":int,\"y\":int} | "
            "{\"type\":\"type\",\"text\":str} | "
            "{\"type\":\"hotkey\",\"keys\":[str,...]} | "
            "{\"type\":\"wait\",\"seconds\":float} | "
            "{\"type\":\"done\",\"reason\":str}. "
            "Only one action. No explanation."
        )
        user_message = (
            f"Objective: {objective}\n"
            f"Latest screenshot: {screenshot_note}\n"
            "Output JSON only."
        )
        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": instructions},
            {"role": "user", "content": user_message},
        ]
        return self.chat(messages)


def _coerce_coordinate(value: object) -> Optional[int]:
    """Return *value* as an int if it is an int or integral float, else None.

    bool is rejected explicitly because it is a subclass of int in Python,
    so ``isinstance(True, int)`` would otherwise accept it as a coordinate.
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, int):
        return value
    if isinstance(value, float) and value.is_integer():
        return int(value)
    return None


def parse_action_text(text: str) -> Optional[Dict[str, object]]:
    """Parse a model reply into a validated action dict, or None if invalid.

    Accepts the raw JSON object, optionally wrapped in a Markdown code fence
    (``` or ```json) — chat models frequently add one despite being told not
    to. Coordinates may be ints or integral floats (e.g. ``10.0``); booleans
    are never accepted as numbers.
    """
    candidate = text.strip()
    # Unwrap a single surrounding Markdown fence, with or without a language tag.
    if candidate.startswith("```") and candidate.endswith("```") and len(candidate) >= 6:
        candidate = candidate.strip("`").strip()
        if candidate[:4].lower() == "json":
            candidate = candidate[4:].strip()

    try:
        data = json.loads(candidate)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None

    action_type = data.get("type")
    if action_type == "click":
        x = _coerce_coordinate(data.get("x"))
        y = _coerce_coordinate(data.get("y"))
        if x is not None and y is not None:
            return {"type": "click", "x": x, "y": y}
    elif action_type == "type":
        if isinstance(data.get("text"), str):
            return {"type": "type", "text": data["text"]}
    elif action_type == "hotkey":
        keys = data.get("keys")
        if isinstance(keys, list) and all(isinstance(k, str) for k in keys):
            return {"type": "hotkey", "keys": keys}
    elif action_type == "wait":
        seconds = data.get("seconds")
        # Exclude bool: json "true" must not become a 1-second wait.
        if isinstance(seconds, (int, float)) and not isinstance(seconds, bool):
            return {"type": "wait", "seconds": float(seconds)}
    elif action_type == "done":
        reason = data.get("reason")
        if isinstance(reason, str):
            return {"type": "done", "reason": reason}
    # Unknown type, or known type with an invalid payload.
    return None
Loading