Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions rebuild_v1/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Optional: set your OpenRouter API key for Phase B
OPENROUTER_API_KEY=
5 changes: 5 additions & 0 deletions rebuild_v1/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
config.json
.env
.venv
__pycache__/
*.pyc
48 changes: 48 additions & 0 deletions rebuild_v1/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Self-Operating Computer v1 (clean rebuild)

A minimal Windows-friendly rewrite that runs a decide-and-act loop with a local Ollama model (no API keys required). Phase B wiring for OpenRouter is also in place.

## Features
- Tkinter GUI with Tasks, Settings, and Logs tabs
- Dry Run safety (on by default) and STOP hotkey/button
- Loop: screenshot → LLM JSON action → validate → (optionally) execute via `pyautogui`
- Config persisted to `rebuild_v1/config.json`

## Requirements
- Python 3.11 on Windows
- [Ollama](https://ollama.com) running locally (default host `http://localhost:11434`)
- Optional: OpenRouter API key for Phase B

Install Python packages:

```bash
python -m venv .venv
.venv\Scripts\activate
pip install -r rebuild_v1/requirements.txt
```

## Running the app
```bash
python -m rebuild_v1.main
```
If Tkinter fails to start in a headless environment, run on a local desktop session.

## Using Ollama (Phase A)
1. Start Ollama: `ollama serve`
2. Pull the default model once: `ollama pull llama3.2:3b`
3. Ensure the host and model fields in **Settings** match your setup.

The app checks connectivity to Ollama and shows a friendly message if the server or model is unavailable.

## Safety controls
- **Dry Run**: enabled by default; shows actions without executing.
- **STOP hotkey**: `Ctrl+Alt+S` (configurable) registered via the `keyboard` library.
- **STOP button**: halts the current loop.
- **Max steps** and **delay** configurable in Settings.

## OpenRouter (Phase B)
A provider dropdown exists; if "OpenRouter (API)" is chosen, set your API key, model, and base URL in Settings. Requests use the OpenAI-compatible SDK.

## Notes
- Screenshots use Pillow's `ImageGrab`; ensure a visible desktop session.
- Avoid sharing `config.json`—it is gitignored and may contain sensitive keys.
133 changes: 133 additions & 0 deletions rebuild_v1/automation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import threading
import time
from dataclasses import dataclass
from typing import Callable, Dict, Optional

import keyboard
import pyautogui
from PIL import ImageGrab

pyautogui.FAILSAFE = False


@dataclass
class ActionResult:
    """Outcome of attempting a single automation action."""

    # The action dict that was requested (e.g. {"type": "click", "x": 10, "y": 20}).
    action: Dict[str, object]
    # True only when the action was actually performed (False in dry-run or on error).
    executed: bool
    # Human-readable failure description, or None on success / dry-run.
    error: Optional[str] = None


class AutomationEngine:
    """Executes screen actions via pyautogui, with dry-run and stop controls.

    Safety features:
    - ``dry_run``: when True (the default), actions are reported but never executed.
    - stop hotkey / :meth:`stop`: cooperative cancellation; loops poll
      :meth:`should_stop` between steps.
    """

    # Action types understood by execute_action.
    _KNOWN_ACTION_TYPES = frozenset({"click", "type", "hotkey", "wait", "done"})

    def __init__(self, dry_run: bool = True, stop_hotkey: str = "ctrl+alt+s"):
        self.dry_run = dry_run
        self.stop_hotkey = stop_hotkey
        self._stop_flag = threading.Event()
        self._hotkey_registered = False

    def register_stop_hotkey(self) -> None:
        """Best-effort registration of the global STOP hotkey (idempotent)."""
        if self._hotkey_registered:
            return
        try:
            keyboard.add_hotkey(self.stop_hotkey, self.stop)
            self._hotkey_registered = True
        except Exception:  # pylint: disable=broad-except
            # Global hooks fail in many ways (missing privileges, bad hotkey
            # string, no display). The hotkey is a convenience, not a
            # requirement — the STOP button still works — so swallow the error
            # rather than crash the UI. NOTE(review): the original caught
            # keyboard.KeyboardException, an attribute the `keyboard` package
            # does not reliably define, which could itself raise AttributeError.
            pass

    def stop(self) -> None:
        """Signal any running loop to halt at its next checkpoint."""
        self._stop_flag.set()

    def reset_stop(self) -> None:
        """Clear the stop signal before starting a new loop."""
        self._stop_flag.clear()

    def should_stop(self) -> bool:
        """Return True once stop() has been called (or the hotkey fired)."""
        return self._stop_flag.is_set()

    def capture_screenshot(self, save_path: Optional[str] = None):
        """Grab the full screen; optionally save to *save_path*. Returns the PIL image."""
        image = ImageGrab.grab()
        if save_path:
            image.save(save_path)
        return image

    def execute_action(self, action: Dict[str, object]) -> ActionResult:
        """Validate and (unless dry_run) perform a single action dict.

        Never raises: failures are reported via ``ActionResult.error``.
        """
        action_type = action.get("type")
        if action_type not in self._KNOWN_ACTION_TYPES:
            # Validate BEFORE the dry-run shortcut so invalid actions are
            # flagged during previews too, not only during real execution.
            return ActionResult(action=action, executed=False, error="Unknown action type")

        if self.dry_run:
            return ActionResult(action=action, executed=False, error=None)

        try:
            if action_type == "click":
                pyautogui.click(x=int(action["x"]), y=int(action["y"]))
            elif action_type == "type":
                pyautogui.typewrite(str(action["text"]))
            elif action_type == "hotkey":
                keys = [str(k) for k in action.get("keys", [])]
                pyautogui.hotkey(*keys)
            elif action_type == "wait":
                time.sleep(float(action.get("seconds", 0)))
            else:  # "done" — terminal marker, nothing to execute
                pass
            return ActionResult(action=action, executed=True, error=None)
        except Exception as exc:  # pylint: disable=broad-except
            # Missing keys, bad values, or pyautogui runtime errors all land
            # here; surface the message instead of killing the loop thread.
            return ActionResult(action=action, executed=False, error=str(exc))


class ActionLooper:
    """Coordinates the screenshot -> decide -> act loop for one objective."""

    def __init__(
        self,
        automation: AutomationEngine,
        request_action: Callable[[str, str], str],
        parse_action: Callable[[str], Optional[Dict[str, object]]],
        max_steps: int,
        delay_seconds: float,
        log_callback: Callable[[str], None],
    ):
        # All collaborators are injected so the loop is testable in isolation.
        self.automation = automation
        self.request_action = request_action
        self.parse_action = parse_action
        self.max_steps = max_steps
        self.delay_seconds = delay_seconds
        self.log_callback = log_callback

    def _decide(self, objective: str, note: str) -> Optional[Dict[str, object]]:
        """Ask the model for one action, retrying up to three times on bad JSON."""
        for _ in range(3):
            reply = self.request_action(objective, note)
            self.log_callback(f"Raw model response: {reply}")
            parsed = self.parse_action(reply)
            if parsed is not None:
                return parsed
            self.log_callback("Model returned invalid JSON. Retrying...")
        return None

    def run(self, objective: str) -> str:
        """Run up to ``max_steps`` iterations; return a human-readable outcome."""
        automation = self.automation
        automation.reset_stop()
        automation.register_stop_hotkey()

        note = "Screenshot captured"
        for step_number in range(1, self.max_steps + 1):
            if automation.should_stop():
                return "Stopped by user"

            # Capture a fresh screenshot; failure is logged, not fatal.
            try:
                automation.capture_screenshot()
            except Exception as exc:  # pylint: disable=broad-except
                self.log_callback(f"Screenshot failed: {exc}")
                note = "screenshot failed"
            else:
                note = "screenshot taken"

            decided = self._decide(objective, note)
            if decided is None:
                return "Failed to parse action after retries"

            outcome = automation.execute_action(decided)
            mode = "executed" if outcome.executed else "dry-run"
            self.log_callback(f"Action step {step_number}: {decided} ({mode})")
            if outcome.error:
                self.log_callback(f"Action error: {outcome.error}")
            if decided.get("type") == "done":
                return str(decided.get("reason", "Done"))

            if automation.should_stop():
                return "Stopped by user"
            time.sleep(self.delay_seconds)

        return "Reached max steps"
39 changes: 39 additions & 0 deletions rebuild_v1/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import json
from pathlib import Path
from typing import Any, Dict

# Settings are persisted next to this module (config.json is gitignored since
# it may hold an API key).
CONFIG_PATH = Path(__file__).parent / "config.json"

# Baseline settings; values read from config.json are overlaid on top of these.
DEFAULT_CONFIG: Dict[str, Any] = {
    "provider": "Ollama (Local)",  # provider dropdown value; alternative: "OpenRouter (API)"
    "ollama_host": "http://localhost:11434",  # local Ollama server
    "ollama_model": "llama3.2:3b",
    "openrouter_api_key": "",  # empty until the user supplies one in Settings
    "openrouter_model": "openrouter/auto",
    "openrouter_base_url": "https://openrouter.ai/api/v1",  # OpenAI-compatible endpoint
    "max_steps": 10,  # cap on loop iterations per objective
    "delay_seconds": 0.6,  # pause between executed actions
    "stop_hotkey": "ctrl+alt+s",
    "dry_run": True,  # safety default: preview actions without executing them
}


def load_config() -> Dict[str, Any]:
    """Load config.json overlaid on DEFAULT_CONFIG.

    Returns a fresh dict in every case; a missing, unreadable, malformed, or
    non-object config file silently falls back to the defaults so the UI can
    always start.
    """
    if not CONFIG_PATH.exists():
        return DEFAULT_CONFIG.copy()
    try:
        with CONFIG_PATH.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, OSError):
        return DEFAULT_CONFIG.copy()
    if not isinstance(data, dict):
        # Valid JSON that is not an object (e.g. a list or string) would
        # otherwise raise TypeError in the mapping merge below.
        return DEFAULT_CONFIG.copy()
    return {**DEFAULT_CONFIG, **data}


def save_config(config: Dict[str, Any]) -> None:
    """Write *config* to CONFIG_PATH as pretty-printed JSON.

    Write failures are deliberately swallowed: a read-only filesystem should
    not crash the UI — the settings simply are not persisted.
    """
    try:
        CONFIG_PATH.write_text(json.dumps(config, indent=2), encoding="utf-8")
    except OSError:
        pass
151 changes: 151 additions & 0 deletions rebuild_v1/engine_llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import json
from typing import Dict, List, Optional

import requests
from openai import OpenAI


class LLMError(Exception):
    """Raised when the language model call fails.

    Covers connectivity problems, HTTP errors, malformed or empty replies,
    and misconfiguration (missing API key, unknown provider). The message is
    user-friendly and intended for direct display in the UI.
    """


class LLMEngine:
    """Routes chat requests to the configured provider (Ollama or OpenRouter).

    Every failure mode is normalized into :class:`LLMError` with a
    user-friendly message suitable for direct display in the UI.
    """

    def __init__(self, config: Dict[str, object]):
        # Config dict shared with the UI; values are read per call so that
        # Settings edits take effect without recreating the engine.
        self.config = config

    def _ollama_chat(self, messages: List[Dict[str, str]]) -> str:
        """Send *messages* to a local Ollama server and return the reply text.

        Raises:
            LLMError: on connectivity failure, HTTP error, malformed body,
                or an empty reply.
        """
        host = str(self.config.get("ollama_host") or "http://localhost:11434")
        model = str(self.config.get("ollama_model") or "llama3.2:3b")

        # Cheap health probe first so connection problems produce a clear,
        # actionable message instead of a chat-call timeout.
        try:
            health = requests.get(f"{host}/api/tags", timeout=3)
        except requests.RequestException as exc:
            raise LLMError(
                "Could not reach Ollama. Please ensure it is running on this machine."
            ) from exc

        if health.status_code != 200:
            raise LLMError(
                "Ollama responded unexpectedly. Please restart Ollama and try again."
            )

        payload = {
            "model": model,
            "messages": messages,
            "stream": False,  # single JSON body, not a streamed response
        }
        try:
            response = requests.post(
                f"{host}/api/chat", json=payload, timeout=30, stream=False
            )
        except requests.RequestException as exc:
            raise LLMError(
                "Failed to call Ollama. Is the service running on the configured host?"
            ) from exc

        if response.status_code == 404:
            raise LLMError(
                "Model not found. Please install it with: ollama pull llama3.2:3b"
            )
        if response.status_code >= 500:
            raise LLMError("Ollama server error. Please try again after a moment.")
        if response.status_code >= 400:
            raise LLMError(
                "Ollama rejected the request. If the model is missing, run: ollama pull llama3.2:3b"
            )

        try:
            data = response.json()
        except ValueError as exc:
            # A truncated or non-JSON body previously escaped as an uncaught
            # exception; keep it inside the LLMError contract instead.
            raise LLMError("Ollama returned a malformed response.") from exc

        message = data.get("message", {})
        # Guard against a non-dict "message" field before .get().
        content = message.get("content") if isinstance(message, dict) else None
        if not content:
            raise LLMError("Ollama returned an empty response.")
        return content

    def _openrouter_chat(self, messages: List[Dict[str, str]]) -> str:
        """Send *messages* to OpenRouter through the OpenAI-compatible SDK.

        Raises:
            LLMError: if the API key is missing, the call fails, or the
                reply is empty.
        """
        api_key = str(self.config.get("openrouter_api_key") or "")
        model = str(self.config.get("openrouter_model") or "openrouter/auto")
        base_url = str(self.config.get("openrouter_base_url") or "https://openrouter.ai/api/v1")

        if not api_key:
            raise LLMError("OpenRouter API key is missing in settings.")

        client = OpenAI(api_key=api_key, base_url=base_url)
        try:
            chat = client.chat.completions.create(
                model=model,
                messages=messages,
                stream=False,
            )
        except Exception as exc:  # pylint: disable=broad-except
            # The SDK raises many exception types (auth, network, rate limit);
            # collapse them into the single error contract the UI understands.
            raise LLMError("OpenRouter call failed. Check network and API key.") from exc

        try:
            content = chat.choices[0].message.content
        except (AttributeError, IndexError):
            content = None
        if not content:
            raise LLMError("OpenRouter returned an empty response.")
        return content

    def chat(self, messages: List[Dict[str, str]]) -> str:
        """Dispatch *messages* to the provider selected in settings.

        Raises:
            LLMError: if the provider is unknown or the call fails.
        """
        provider = self.config.get("provider", "Ollama (Local)")
        if provider == "Ollama (Local)":
            return self._ollama_chat(messages)
        if provider == "OpenRouter (API)":
            return self._openrouter_chat(messages)
        raise LLMError("Unsupported provider selected.")

    def request_action(self, objective: str, screenshot_note: str) -> str:
        """Build the JSON-only action prompt and return the raw model reply."""
        prompt = (
            "You are controlling a computer. Decide the SINGLE next action as strict JSON only. "
            "Use one of: click, type, hotkey, wait, done. Respond with JSON only."
        )
        instructions = (
            "Schema: {\"type\":\"click\",\"x\":int,\"y\":int} | "
            "{\"type\":\"type\",\"text\":str} | "
            "{\"type\":\"hotkey\",\"keys\":[str,...]} | "
            "{\"type\":\"wait\",\"seconds\":float} | "
            "{\"type\":\"done\",\"reason\":str}. "
            "Only one action. No explanation."
        )
        user_message = (
            f"Objective: {objective}\n"
            f"Latest screenshot: {screenshot_note}\n"
            "Output JSON only."
        )
        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": instructions},
            {"role": "user", "content": user_message},
        ]
        return self.chat(messages)


def _coerce_coordinate(value: object) -> Optional[int]:
    """Return *value* as an int if it is an int or integral float, else None.

    bool is rejected explicitly because it is a subclass of int in Python,
    so ``isinstance(True, int)`` would otherwise accept it as a coordinate.
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, int):
        return value
    if isinstance(value, float) and value.is_integer():
        return int(value)
    return None


def parse_action_text(text: str) -> Optional[Dict[str, object]]:
    """Parse a model reply into a validated action dict, or None if invalid.

    Accepts the raw JSON object, optionally wrapped in a Markdown code fence
    (``` or ```json) — chat models frequently add one despite being told not
    to. Coordinates may be ints or integral floats (e.g. ``10.0``); booleans
    are never accepted as numbers.
    """
    candidate = text.strip()
    # Unwrap a single surrounding Markdown fence, with or without a language tag.
    if candidate.startswith("```") and candidate.endswith("```") and len(candidate) >= 6:
        candidate = candidate.strip("`").strip()
        if candidate[:4].lower() == "json":
            candidate = candidate[4:].strip()

    try:
        data = json.loads(candidate)
    except json.JSONDecodeError:
        return None
    if not isinstance(data, dict):
        return None

    action_type = data.get("type")
    if action_type == "click":
        x = _coerce_coordinate(data.get("x"))
        y = _coerce_coordinate(data.get("y"))
        if x is not None and y is not None:
            return {"type": "click", "x": x, "y": y}
    elif action_type == "type":
        if isinstance(data.get("text"), str):
            return {"type": "type", "text": data["text"]}
    elif action_type == "hotkey":
        keys = data.get("keys")
        if isinstance(keys, list) and all(isinstance(k, str) for k in keys):
            return {"type": "hotkey", "keys": keys}
    elif action_type == "wait":
        seconds = data.get("seconds")
        # Exclude bool: json "true" must not become a 1-second wait.
        if isinstance(seconds, (int, float)) and not isinstance(seconds, bool):
            return {"type": "wait", "seconds": float(seconds)}
    elif action_type == "done":
        reason = data.get("reason")
        if isinstance(reason, str):
            return {"type": "done", "reason": reason}
    # Unknown type, or known type with an invalid payload.
    return None
Loading