diff --git a/.vscode/settings.json b/.vscode/settings.json index a9d566c..d5f8acf 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,6 +1,6 @@ { "eslint.format.enable": true, "editor.codeActionsOnSave": { - "source.fixAll.eslint": true + "source.fixAll.eslint": "explicit" } } diff --git a/src/helpers/availableActions.ts b/src/helpers/availableActions.ts index 823045c..9433613 100644 --- a/src/helpers/availableActions.ts +++ b/src/helpers/availableActions.ts @@ -61,3 +61,12 @@ export type ActionPayload = { Extract >; }[AvailableAction['name']]; + +export const formattedActions = availableActions + .map((action, i) => { + const args = action.args + .map((arg) => `${arg.name}: ${arg.type}`) + .join(', '); + return `${i + 1}. ${action.name}(${args}): ${action.description}`; + }) + .join('\n'); diff --git a/src/helpers/determineNextAction.ts b/src/helpers/determineNextAction.ts index 933aff7..3f9d801 100644 --- a/src/helpers/determineNextAction.ts +++ b/src/helpers/determineNextAction.ts @@ -1,18 +1,9 @@ import OpenAI from 'openpipe/openai'; import { useAppState } from '../state/store'; -import { availableActions } from './availableActions'; +import { formattedActions } from './availableActions'; import { ParsedResponseSuccess } from './parseResponse'; -const formattedActions = availableActions - .map((action, i) => { - const args = action.args - .map((arg) => `${arg.name}: ${arg.type}`) - .join(', '); - return `${i + 1}. ${action.name}(${args}): ${action.description}`; - }) - .join('\n'); - -const systemMessage = ` +const getSystemMessage = () => ` You are a browser automation assistant. You can use the following tools: @@ -23,27 +14,123 @@ You will be be given a task to perform and the current state of the DOM. You wil This is an example of an action: +1 I should click the add to cart button click(223) -You must always include the and open/close tags or else your response will be marked as invalid.`; +CurrentStep is the step of the plan that you are currently on. It is a string that you can increment. Some steps may take more than one action to complete. +You must always include the , , and open/close tags or else your response will be marked as invalid.`; + +async function generatePlan(taskInstructions: string, simplifiedDOM: string) { + const model = useAppState.getState().settings.selectedModel; + const openAIKey = useAppState.getState().settings.openAIKey; + const openPipeKey = useAppState.getState().settings.openPipeKey; + + if (!openAIKey) { + throw new Error('No OpenAI key found'); + } + + const openai = new OpenAI({ + apiKey: openAIKey, + dangerouslyAllowBrowser: true, + openpipe: { + apiKey: openPipeKey ?? undefined, + }, + }); + + const planPrompt = ` +You are a browser automation planner. Create a step-by-step plan to accomplish the following task: + +${taskInstructions} + +Provide a numbered list of high-level steps to complete this task. Where each step corresponds to a click. Be specific but concise. +`; + + try { + const completion = await openai.chat.completions.create({ + model, + messages: [{ role: 'user', content: planPrompt }], + max_completion_tokens: 1000, + reasoning_effort: model === 'o1' ? 'low' : undefined, + temperature: 0, + store: openPipeKey ? true : false, + }); + + const plan = completion.choices[0].message?.content?.trim(); + console.log('plan', plan); + return plan; + } catch (error: any) { + console.error('Error generating plan:', error); + throw new Error(`Failed to generate plan: ${error.message}`); + } +} + +export function formatPrompt( + taskInstructions: string, + previousActions: ParsedResponseSuccess[], + pageContents: string, + plan?: string +) { + let previousActionsString = ''; + + if (previousActions.length > 0) { + const serializedActions = previousActions + .map( + (action) => + `${action.thought}\n${action.currentStep}\n${action.action}` + ) + .join('\n\n'); + previousActionsString = `You have already taken the following actions: \n${serializedActions}\n\n`; + } + + const planInfo = plan + ? `Here is the plan for completing this task:\n${plan}\n\n + Follow this plan to complete the task. \n\n` + : ''; + + return `The user requests the following task: + +${taskInstructions} + +${planInfo}${previousActionsString} + +Current page contents: +${pageContents}`; +} export async function determineNextAction( taskInstructions: string, previousActions: ParsedResponseSuccess[], simplifiedDOM: string, maxAttempts = 3, - notifyError?: (error: string) => void + notifyError?: (error: string) => void, + existingPlan?: string | null ) { const model = useAppState.getState().settings.selectedModel; - const prompt = formatPrompt(taskInstructions, previousActions, simplifiedDOM); const openAIKey = useAppState.getState().settings.openAIKey; const openPipeKey = useAppState.getState().settings.openPipeKey; + if (!openAIKey) { notifyError?.('No OpenAI key found'); return null; } + let plan = existingPlan || undefined; + if (!plan && previousActions.length === 0) { + try { + plan = await generatePlan(taskInstructions, simplifiedDOM); + } catch (error: any) { + notifyError?.(error.message); + } + } + + const prompt = formatPrompt( + taskInstructions, + previousActions, + simplifiedDOM, + plan || undefined + ); + const openai = new OpenAI({ apiKey: openAIKey, dangerouslyAllowBrowser: true, @@ -52,6 +139,8 @@ export async function determineNextAction( }, }); + console.log('prompt', prompt); + console.log('getSystemMessage', getSystemMessage()); for (let i = 0; i < maxAttempts; i++) { try { const completion = await openai.chat.completions.create({ @@ -59,7 +148,7 @@ export async function determineNextAction( messages: [ { role: 'system', - content: systemMessage, + content: getSystemMessage(), }, { role: 'user', content: prompt }, ], @@ -70,20 +159,24 @@ export async function determineNextAction( store: openPipeKey ? true : false, }); + const response = + completion.choices[0].message?.content?.trim() + ''; + + console.log('OpenAI response:', response); + return { usage: completion.usage, prompt, response: completion.choices[0].message?.content?.trim() + '', + plan: plan || undefined, }; } catch (error: any) { console.log('determineNextAction error', error); if (error.message.includes('server error')) { - // Problem with the OpenAI API, try again if (notifyError) { notifyError(error.message); } } else { - // Another error, give up throw new Error(error.message); } } @@ -92,32 +185,3 @@ export async function determineNextAction( `Failed to complete query after ${maxAttempts} attempts. Please try again later.` ); } - -export function formatPrompt( - taskInstructions: string, - previousActions: ParsedResponseSuccess[], - pageContents: string -) { - let previousActionsString = ''; - - if (previousActions.length > 0) { - const serializedActions = previousActions - .map( - (action) => - `${action.thought}\n${action.action}` - ) - .join('\n\n'); - previousActionsString = `You have already taken the following actions: \n${serializedActions}\n\n`; - } - - return `The user requests the following task: - -${taskInstructions} - -${previousActionsString} - -Current time: ${new Date().toLocaleString()} - -Current page contents: -${pageContents}`; -} diff --git a/src/helpers/parseResponse.ts b/src/helpers/parseResponse.ts index bdee5f4..0770f79 100644 --- a/src/helpers/parseResponse.ts +++ b/src/helpers/parseResponse.ts @@ -2,6 +2,7 @@ import { ActionPayload, availableActions } from './availableActions'; export type ParsedResponseSuccess = { thought: string; + currentStep: string; action: string; parsedAction: ActionPayload; }; @@ -15,6 +16,7 @@ export type ParsedResponse = export function parseResponse(text: string): ParsedResponse { const thoughtMatch = text.match(/(.*?)<\/Thought>/); const actionMatch = text.match(/(.*?)<\/Action>/); + const currentStepMatch = text.match(/(.*?)<\/CurrentStep>/); if (!thoughtMatch) { return { @@ -28,8 +30,15 @@ export function parseResponse(text: string): ParsedResponse { }; } + if (!currentStepMatch) { + return { + error: 'Invalid response: Current step not found in the model response.', + }; + } + const thought = thoughtMatch[1]; const actionString = actionMatch[1]; + const currentStep = currentStepMatch[1]; const actionPattern = /(\w+)\((.*?)\)/; const actionParts = actionString.match(actionPattern); @@ -109,6 +118,7 @@ export function parseResponse(text: string): ParsedResponse { return { thought, + currentStep: currentStep, action: actionString, parsedAction, }; diff --git a/src/manifest.json b/src/manifest.json index 98c9127..5b72ca1 100755 --- a/src/manifest.json +++ b/src/manifest.json @@ -11,12 +11,16 @@ "storage", "clipboardWrite", "debugger", - "management" + "management", + "sidePanel" ], "action": { "default_popup": "popup.html", "default_icon": "icon-34.png" }, + "chrome_url_overrides": { + "sidebar": "popup.html" + }, "icons": { "128": "icon-128.png" }, @@ -41,5 +45,8 @@ "mac": "Command+Shift+Y" } } + }, + "side_panel": { + "default_path": "popup.html" } } diff --git a/src/state/currentTask.ts b/src/state/currentTask.ts index 3106287..f4a6c41 100644 --- a/src/state/currentTask.ts +++ b/src/state/currentTask.ts @@ -1,4 +1,3 @@ -import { CreateCompletionResponseUsage } from 'openai'; import { attachDebugger, detachDebugger } from '../helpers/chromeDebugger'; import { disableIncompatibleExtensions, @@ -16,11 +15,17 @@ import { getSimplifiedDom } from '../helpers/simplifyDom'; import { sleep, truthyFilter } from '../helpers/utils'; import { MyStateCreator } from './store'; +interface CreateCompletionResponseUsage { + prompt_tokens: number; + completion_tokens: number; + total_tokens: number; +} + export type TaskHistoryEntry = { prompt: string; response: string; action: ParsedResponse; - usage: CreateCompletionResponseUsage; + usage?: CreateCompletionResponseUsage; }; export type CurrentTaskSlice = { @@ -36,6 +41,7 @@ export type CurrentTaskSlice = { | 'performing-query' | 'performing-action' | 'waiting'; + plan: string | null; actions: { runTask: (onError: (error: string) => void) => Promise; interrupt: () => void; @@ -50,6 +56,7 @@ export const createCurrentTaskSlice: MyStateCreator = ( history: [], status: 'idle', actionStatus: 'idle', + plan: null, actions: { runTask: async (onError) => { const wasStopped = () => get().currentTask.status !== 'running'; @@ -68,6 +75,7 @@ export const createCurrentTaskSlice: MyStateCreator = ( state.currentTask.history = []; state.currentTask.status = 'running'; state.currentTask.actionStatus = 'attaching-debugger'; + state.currentTask.plan = null; }); try { @@ -115,7 +123,8 @@ export const createCurrentTaskSlice: MyStateCreator = ( ) as ParsedResponseSuccess[], currentDom, 3, - onError + onError, + get().currentTask.plan ); if (!query) { @@ -170,6 +179,12 @@ export const createCurrentTaskSlice: MyStateCreator = ( setActionStatus('waiting'); // sleep 2 seconds. This is pretty arbitrary; we should figure out a better way to determine when the page has settled. await sleep(2000); + + if (query.plan && !get().currentTask.plan) { + set((state) => { + state.currentTask.plan = query.plan || null; + }); + } } set((state) => { state.currentTask.status = 'success';