Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"eslint.format.enable": true,
"editor.codeActionsOnSave": {
"source.fixAll.eslint": true
"source.fixAll.eslint": "explicit"
}
}
9 changes: 9 additions & 0 deletions src/helpers/availableActions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,12 @@ export type ActionPayload = {
Extract<AvailableAction, { name: K }>
>;
}[AvailableAction['name']];

/**
 * Numbered, newline-separated listing of every entry in `availableActions`.
 *
 * Each line has the shape `N. name(arg: type, ...): description`, where `N`
 * is the 1-based position of the action in the list.
 */
export const formattedActions = availableActions
  .map((tool, index) => {
    // Render the argument list as "name: type" pairs separated by commas.
    const signature = tool.args
      .map((param) => `${param.name}: ${param.type}`)
      .join(', ');
    const ordinal = index + 1;
    return `${ordinal}. ${tool.name}(${signature}): ${tool.description}`;
  })
  .join('\n');
156 changes: 110 additions & 46 deletions src/helpers/determineNextAction.ts
Original file line number Diff line number Diff line change
@@ -1,18 +1,9 @@
import OpenAI from 'openpipe/openai';
import { useAppState } from '../state/store';
import { availableActions } from './availableActions';
import { formattedActions } from './availableActions';
import { ParsedResponseSuccess } from './parseResponse';

// One line per available action, numbered from 1, formatted as
// "N. name(arg: type, ...): description" and joined with newlines.
const formattedActions = availableActions
  .map((entry, position) => {
    const renderedArgs: string[] = [];
    for (const parameter of entry.args) {
      renderedArgs.push(`${parameter.name}: ${parameter.type}`);
    }
    return `${position + 1}. ${entry.name}(${renderedArgs.join(', ')}): ${
      entry.description
    }`;
  })
  .join('\n');

const systemMessage = `
const getSystemMessage = () => `
You are a browser automation assistant.

You can use the following tools:
Expand All @@ -23,27 +14,123 @@ You will be be given a task to perform and the current state of the DOM. You wil

This is an example of an action:

<CurrentStep>1</CurrentStep>
<Thought>I should click the add to cart button</Thought>
<Action>click(223)</Action>

You must always include the <Thought> and <Action> open/close tags or else your response will be marked as invalid.`;
CurrentStep is the step of the plan that you are currently on. It is a string that you can increment. Some steps may take more than one action to complete.
You must always include the <Thought>, <CurrentStep>, and <Action> open/close tags or else your response will be marked as invalid.`;

/**
 * Asks the configured model for a numbered, high-level plan for the task.
 *
 * Reads the selected model and API keys from the app state, sends a single
 * user message containing the task instructions, and returns the trimmed
 * text of the first completion choice.
 *
 * @param taskInstructions - The user's task, interpolated verbatim into the
 *   planning prompt.
 * @param simplifiedDOM - Not used by the planning prompt; kept so the
 *   signature stays stable for existing callers.
 * @returns The trimmed plan text, or `undefined` when the response contains
 *   no choices or no message content.
 * @throws Error when no OpenAI key is configured, or when the completion
 *   request fails (message prefixed with "Failed to generate plan: ").
 */
async function generatePlan(taskInstructions: string, simplifiedDOM: string) {
  const {
    selectedModel: model,
    openAIKey,
    openPipeKey,
  } = useAppState.getState().settings;

  if (!openAIKey) {
    throw new Error('No OpenAI key found');
  }

  const openai = new OpenAI({
    apiKey: openAIKey,
    dangerouslyAllowBrowser: true,
    openpipe: {
      apiKey: openPipeKey ?? undefined,
    },
  });

  const planPrompt = `
You are a browser automation planner. Create a step-by-step plan to accomplish the following task:

${taskInstructions}

Provide a numbered list of high-level steps to complete this task. Where each step corresponds to a click. Be specific but concise.
`;

  try {
    // NOTE(review): o1-family models have rejected non-default `temperature`
    // values in the past — confirm that sending 0 is accepted for 'o1'.
    const completion = await openai.chat.completions.create({
      model,
      messages: [{ role: 'user', content: planPrompt }],
      max_completion_tokens: 1000,
      reasoning_effort: model === 'o1' ? 'low' : undefined,
      temperature: 0,
      store: Boolean(openPipeKey),
    });

    // `choices` may be empty; optional chaining yields `undefined` instead of
    // throwing a TypeError that would be reported as a failed request.
    const plan = completion.choices[0]?.message?.content?.trim();
    console.log('plan', plan);
    return plan;
  } catch (error: unknown) {
    console.error('Error generating plan:', error);

    // Thrown values are not guaranteed to be Error instances (or even
    // objects), so extract a readable reason defensively.
    let reason: string;
    if (error instanceof Error) {
      reason = error.message;
    } else if (
      typeof error === 'object' &&
      error !== null &&
      'message' in error
    ) {
      reason = String((error as { message: unknown }).message);
    } else {
      reason = String(error);
    }

    throw new Error(`Failed to generate plan: ${reason}`);
  }
}

/**
 * Builds the user-message prompt sent to the model for the next action.
 *
 * The prompt contains, in order: the task instructions, the plan (only when
 * a non-empty plan is supplied), the previously taken actions (only when
 * there are any), and the current page contents.
 *
 * @param taskInstructions - The task requested by the user.
 * @param previousActions - Earlier parsed responses; each is serialized as
 *   its Thought, CurrentStep and Action tags.
 * @param pageContents - Text placed under "Current page contents:".
 * @param plan - Optional plan text; omitted from the prompt when falsy.
 * @returns The assembled prompt string.
 */
export function formatPrompt(
  taskInstructions: string,
  previousActions: ParsedResponseSuccess[],
  pageContents: string,
  plan?: string
) {
  // History section: empty unless at least one action was already taken.
  const historySection =
    previousActions.length === 0
      ? ''
      : `You have already taken the following actions: \n${previousActions
          .map((step) =>
            [
              `<Thought>${step.thought}</Thought>`,
              `<CurrentStep>${step.currentStep}</CurrentStep>`,
              `<Action>${step.action}</Action>`,
            ].join('\n')
          )
          .join('\n\n')}\n\n`;

  // Plan section: empty when no plan (or an empty plan) is provided.
  let planSection = '';
  if (plan) {
    planSection = `Here is the plan for completing this task:\n${plan}\n\n\nFollow this plan to complete the task. \n\n`;
  }

  return [
    'The user requests the following task:',
    '',
    `${taskInstructions}`,
    '',
    `${planSection}${historySection}`,
    '',
    'Current page contents:',
    `${pageContents}`,
  ].join('\n');
}

export async function determineNextAction(
taskInstructions: string,
previousActions: ParsedResponseSuccess[],
simplifiedDOM: string,
maxAttempts = 3,
notifyError?: (error: string) => void
notifyError?: (error: string) => void,
existingPlan?: string | null
) {
const model = useAppState.getState().settings.selectedModel;
const prompt = formatPrompt(taskInstructions, previousActions, simplifiedDOM);
const openAIKey = useAppState.getState().settings.openAIKey;
const openPipeKey = useAppState.getState().settings.openPipeKey;

if (!openAIKey) {
notifyError?.('No OpenAI key found');
return null;
}

let plan = existingPlan || undefined;
if (!plan && previousActions.length === 0) {
try {
plan = await generatePlan(taskInstructions, simplifiedDOM);
} catch (error: any) {
notifyError?.(error.message);
}
}

const prompt = formatPrompt(
taskInstructions,
previousActions,
simplifiedDOM,
plan || undefined
);

const openai = new OpenAI({
apiKey: openAIKey,
dangerouslyAllowBrowser: true,
Expand All @@ -52,14 +139,16 @@ export async function determineNextAction(
},
});

console.log('prompt', prompt);
console.log('getSystemMessage', getSystemMessage());
for (let i = 0; i < maxAttempts; i++) {
try {
const completion = await openai.chat.completions.create({
model,
messages: [
{
role: 'system',
content: systemMessage,
content: getSystemMessage(),
},
{ role: 'user', content: prompt },
],
Expand All @@ -70,20 +159,24 @@ export async function determineNextAction(
store: openPipeKey ? true : false,
});

const response =
completion.choices[0].message?.content?.trim() + '</Action>';

console.log('OpenAI response:', response);

return {
usage: completion.usage,
prompt,
response: completion.choices[0].message?.content?.trim() + '</Action>',
plan: plan || undefined,
};
} catch (error: any) {
console.log('determineNextAction error', error);
if (error.message.includes('server error')) {
// Problem with the OpenAI API, try again
if (notifyError) {
notifyError(error.message);
}
} else {
// Another error, give up
throw new Error(error.message);
}
}
Expand All @@ -92,32 +185,3 @@ export async function determineNextAction(
`Failed to complete query after ${maxAttempts} attempts. Please try again later.`
);
}

/**
 * Builds the user-message prompt for the next action.
 *
 * The prompt contains the task instructions, the previously taken actions
 * (only when there are any), the current local time, and the page contents.
 *
 * @param taskInstructions - The task requested by the user.
 * @param previousActions - Earlier parsed responses; each is serialized as
 *   its Thought and Action tags.
 * @param pageContents - Text placed under "Current page contents:".
 * @returns The assembled prompt string.
 */
export function formatPrompt(
  taskInstructions: string,
  previousActions: ParsedResponseSuccess[],
  pageContents: string
) {
  const renderedHistory = previousActions.map(
    (entry) =>
      `<Thought>${entry.thought}</Thought>\n<Action>${entry.action}</Action>`
  );

  // The history section is empty when nothing has been done yet.
  const historySection =
    renderedHistory.length > 0
      ? `You have already taken the following actions: \n${renderedHistory.join(
          '\n\n'
        )}\n\n`
      : '';

  const timestamp = new Date().toLocaleString();

  return (
    'The user requests the following task:\n\n' +
    `${taskInstructions}\n\n` +
    `${historySection}\n\n` +
    `Current time: ${timestamp}\n\n` +
    `Current page contents:\n${pageContents}`
  );
}
10 changes: 10 additions & 0 deletions src/helpers/parseResponse.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { ActionPayload, availableActions } from './availableActions';

/** Shape returned by `parseResponse` when the model response parsed without error. */
export type ParsedResponseSuccess = {
// Text captured between the <Thought> and </Thought> tags.
thought: string;
// Text captured between the <CurrentStep> and </CurrentStep> tags (kept as a string).
currentStep: string;
// Raw text captured between the <Action> and </Action> tags.
action: string;
// Structured form of the action; presumably derived from `action` — the construction is not visible here.
parsedAction: ActionPayload;
};
Expand All @@ -15,6 +16,7 @@ export type ParsedResponse =
export function parseResponse(text: string): ParsedResponse {
const thoughtMatch = text.match(/<Thought>(.*?)<\/Thought>/);
const actionMatch = text.match(/<Action>(.*?)<\/Action>/);
const currentStepMatch = text.match(/<CurrentStep>(.*?)<\/CurrentStep>/);

if (!thoughtMatch) {
return {
Expand All @@ -28,8 +30,15 @@ export function parseResponse(text: string): ParsedResponse {
};
}

if (!currentStepMatch) {
return {
error: 'Invalid response: Current step not found in the model response.',
};
}

const thought = thoughtMatch[1];
const actionString = actionMatch[1];
const currentStep = currentStepMatch[1];
const actionPattern = /(\w+)\((.*?)\)/;
const actionParts = actionString.match(actionPattern);

Expand Down Expand Up @@ -109,6 +118,7 @@ export function parseResponse(text: string): ParsedResponse {

return {
thought,
currentStep: currentStep,
action: actionString,
parsedAction,
};
Expand Down
9 changes: 8 additions & 1 deletion src/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,16 @@
"storage",
"clipboardWrite",
"debugger",
"management"
"management",
"sidePanel"
],
"action": {
"default_popup": "popup.html",
"default_icon": "icon-34.png"
},
"chrome_url_overrides": {
"sidebar": "popup.html"
},
"icons": {
"128": "icon-128.png"
},
Expand All @@ -41,5 +45,8 @@
"mac": "Command+Shift+Y"
}
}
},
"side_panel": {
"default_path": "popup.html"
}
}
21 changes: 18 additions & 3 deletions src/state/currentTask.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import { CreateCompletionResponseUsage } from 'openai';
import { attachDebugger, detachDebugger } from '../helpers/chromeDebugger';
import {
disableIncompatibleExtensions,
Expand All @@ -16,11 +15,17 @@ import { getSimplifiedDom } from '../helpers/simplifyDom';
import { sleep, truthyFilter } from '../helpers/utils';
import { MyStateCreator } from './store';

// Token-usage record stored on a task history entry.
// NOTE(review): the field names look like the `usage` object of an OpenAI
// completion response — confirm this stays in sync with the SDK's own type.
interface CreateCompletionResponseUsage {
// presumably the number of tokens in the prompt — confirm
prompt_tokens: number;
// presumably the number of tokens in the generated completion — confirm
completion_tokens: number;
// presumably prompt_tokens + completion_tokens — confirm
total_tokens: number;
}

export type TaskHistoryEntry = {
prompt: string;
response: string;
action: ParsedResponse;
usage: CreateCompletionResponseUsage;
usage?: CreateCompletionResponseUsage;
};

export type CurrentTaskSlice = {
Expand All @@ -36,6 +41,7 @@ export type CurrentTaskSlice = {
| 'performing-query'
| 'performing-action'
| 'waiting';
plan: string | null;
actions: {
runTask: (onError: (error: string) => void) => Promise<void>;
interrupt: () => void;
Expand All @@ -50,6 +56,7 @@ export const createCurrentTaskSlice: MyStateCreator<CurrentTaskSlice> = (
history: [],
status: 'idle',
actionStatus: 'idle',
plan: null,
actions: {
runTask: async (onError) => {
const wasStopped = () => get().currentTask.status !== 'running';
Expand All @@ -68,6 +75,7 @@ export const createCurrentTaskSlice: MyStateCreator<CurrentTaskSlice> = (
state.currentTask.history = [];
state.currentTask.status = 'running';
state.currentTask.actionStatus = 'attaching-debugger';
state.currentTask.plan = null;
});

try {
Expand Down Expand Up @@ -115,7 +123,8 @@ export const createCurrentTaskSlice: MyStateCreator<CurrentTaskSlice> = (
) as ParsedResponseSuccess[],
currentDom,
3,
onError
onError,
get().currentTask.plan
);

if (!query) {
Expand Down Expand Up @@ -170,6 +179,12 @@ export const createCurrentTaskSlice: MyStateCreator<CurrentTaskSlice> = (
setActionStatus('waiting');
// sleep 2 seconds. This is pretty arbitrary; we should figure out a better way to determine when the page has settled.
await sleep(2000);

if (query.plan && !get().currentTask.plan) {
set((state) => {
state.currentTask.plan = query.plan || null;
});
}
}
set((state) => {
state.currentTask.status = 'success';
Expand Down