TaxyAI · Dsinghbailey · Feb 26, 2025
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,6 +1,6 @@
 {
   "eslint.format.enable": true,
   "editor.codeActionsOnSave": {
-    "source.fixAll.eslint": true
+    "source.fixAll.eslint": "explicit"
   }
 }
diff --git a/src/helpers/availableActions.ts b/src/helpers/availableActions.ts
@@ -61,3 +61,12 @@ export type ActionPayload = {
     Extract<AvailableAction, { name: K }>
   >;
 }[AvailableAction['name']];
+
+/** Numbered listing of `availableActions`, one `name(args): description` per line. */
+export const formattedActions = availableActions
+  .map((action, index) => {
+    const params = action.args.map((arg) => `${arg.name}: ${arg.type}`);
+    const signature = `${action.name}(${params.join(', ')})`;
+    return `${index + 1}. ${signature}: ${action.description}`;
+  })
+  .join('\n');
diff --git a/src/helpers/determineNextAction.ts b/src/helpers/determineNextAction.ts
@@ -1,18 +1,9 @@
 import OpenAI from 'openpipe/openai';
 import { useAppState } from '../state/store';
-import { availableActions } from './availableActions';
+import { formattedActions } from './availableActions';
 import { ParsedResponseSuccess } from './parseResponse';
 
-const formattedActions = availableActions
-  .map((action, i) => {
-    const args = action.args
-      .map((arg) => `${arg.name}: ${arg.type}`)
-      .join(', ');
-    return `${i + 1}. ${action.name}(${args}): ${action.description}`;
-  })
-  .join('\n');
-
-const systemMessage = `
+const getSystemMessage = () => `
 You are a browser automation assistant.
 
 You can use the following tools:
@@ -23,27 +14,123 @@ You will be be given a task to perform and the current state of the DOM. You wil
 
 This is an example of an action:
 
+<CurrentStep>1</CurrentStep>
 <Thought>I should click the add to cart button</Thought>
 <Action>click(223)</Action>
 
-You must always include the <Thought> and <Action> open/close tags or else your response will be marked as invalid.`;
+CurrentStep is the step of the plan that you are currently on. It is a string that you can increment. Some steps may take more than one action to complete.
+You must always include the <Thought>, <CurrentStep>, and <Action> open/close tags or else your response will be marked as invalid.`;
+
+/**
+ * Asks the selected model for a numbered plan; throws on a missing key or failed call.
+ * @param taskInstructions - Task text interpolated into the planning prompt.
+ * @param simplifiedDOM - Not read by this function; kept for its caller.
+ * @returns The trimmed plan text, or `undefined` when the reply has no content.
+ */
+async function generatePlan(taskInstructions: string, simplifiedDOM: string) {
+  const { selectedModel: model, openAIKey, openPipeKey } =
+    useAppState.getState().settings;
+  if (!openAIKey) {
+    throw new Error('No OpenAI key found');
+  }
+  const openai = new OpenAI({
+    apiKey: openAIKey,
+    dangerouslyAllowBrowser: true,
+    openpipe: { apiKey: openPipeKey ?? undefined },
+  });
+  const planPrompt = `
+You are a browser automation planner. Create a step-by-step plan to accomplish the following task:
+
+${taskInstructions}
+
+Provide a numbered list of high-level steps to complete this task. Where each step corresponds to a click. Be specific but concise.
+`;
+
+  try {
+    const completion = await openai.chat.completions.create({
+      model,
+      messages: [{ role: 'user', content: planPrompt }],
+      max_completion_tokens: 1000,
+      reasoning_effort: model === 'o1' ? 'low' : undefined,
+      temperature: 0,
+      store: Boolean(openPipeKey),
+    });
+    const plan = completion.choices[0].message?.content?.trim();
+    console.log('plan', plan);
+    return plan;
+  } catch (error: unknown) {
+    console.error('Error generating plan:', error);
+    const reason = error instanceof Error ? error.message : String(error);
+    throw new Error(`Failed to generate plan: ${reason}`);
+  }
+}
+
+/**
+ * Builds the user prompt: task, optional plan, prior actions, page contents.
+ * The plan section uses escapes only, so no source indentation leaks into it.
+ * @param previousActions - Actions already taken, serialized with their tags.
+ * @param plan - Plan text; the plan section is omitted when this is falsy.
+ */
+export function formatPrompt(
+  taskInstructions: string,
+  previousActions: ParsedResponseSuccess[],
+  pageContents: string,
+  plan?: string
+) {
+  const serializedActions = previousActions
+    .map(({ thought, currentStep, action }) =>
+      `<Thought>${thought}</Thought>\n<CurrentStep>${currentStep}</CurrentStep>\n<Action>${action}</Action>`
+    )
+    .join('\n\n');
+  const previousActionsString = previousActions.length
+    ? `You have already taken the following actions: \n${serializedActions}\n\n`
+    : '';
+  const planInfo = plan
+    ? `Here is the plan for completing this task:\n${plan}\n\nFollow this plan to complete the task.\n\n`
+    : '';
+  return `The user requests the following task:
+
+${taskInstructions}
+
+${planInfo}${previousActionsString}
+
+Current page contents:
+${pageContents}`;
+}
 
 export async function determineNextAction(
   taskInstructions: string,
   previousActions: ParsedResponseSuccess[],
   simplifiedDOM: string,
   maxAttempts = 3,
-  notifyError?: (error: string) => void
+  notifyError?: (error: string) => void,
+  existingPlan?: string | null
 ) {
   const model = useAppState.getState().settings.selectedModel;
-  const prompt = formatPrompt(taskInstructions, previousActions, simplifiedDOM);
   const openAIKey = useAppState.getState().settings.openAIKey;
   const openPipeKey = useAppState.getState().settings.openPipeKey;
+
   if (!openAIKey) {
     notifyError?.('No OpenAI key found');
     return null;
   }
 
+  let plan = existingPlan || undefined;
+  if (!plan && previousActions.length === 0) {
+    try {
+      plan = await generatePlan(taskInstructions, simplifiedDOM);
+    } catch (error: any) {
+      notifyError?.(error.message);
+    }
+  }
+
+  const prompt = formatPrompt(
+    taskInstructions,
+    previousActions,
+    simplifiedDOM,
+    plan || undefined
+  );
+
   const openai = new OpenAI({
     apiKey: openAIKey,
     dangerouslyAllowBrowser: true,
@@ -52,14 +139,16 @@ export async function determineNextAction(
     },
   });
 
+  console.log('prompt', prompt);
+  console.log('getSystemMessage', getSystemMessage());
   for (let i = 0; i < maxAttempts; i++) {
     try {
       const completion = await openai.chat.completions.create({
         model,
         messages: [
           {
             role: 'system',
-            content: systemMessage,
+            content: getSystemMessage(),
           },
           { role: 'user', content: prompt },
         ],
@@ -70,20 +159,24 @@ export async function determineNextAction(
         store: openPipeKey ? true : false,
       });
 
+      const response =
+        completion.choices[0].message?.content?.trim() + '</Action>';
+
+      console.log('OpenAI response:', response);
+
       return {
         usage: completion.usage,
         prompt,
         response: completion.choices[0].message?.content?.trim() + '</Action>',
+        plan: plan || undefined,
       };
     } catch (error: any) {
       console.log('determineNextAction error', error);
       if (error.message.includes('server error')) {
-        // Problem with the OpenAI API, try again
         if (notifyError) {
           notifyError(error.message);
         }
       } else {
-        // Another error, give up
         throw new Error(error.message);
       }
     }
@@ -92,32 +185,3 @@ export async function determineNextAction(
     `Failed to complete query after ${maxAttempts} attempts. Please try again later.`
   );
 }
-
-export function formatPrompt(
-  taskInstructions: string,
-  previousActions: ParsedResponseSuccess[],
-  pageContents: string
-) {
-  let previousActionsString = '';
-
-  if (previousActions.length > 0) {
-    const serializedActions = previousActions
-      .map(
-        (action) =>
-          `<Thought>${action.thought}</Thought>\n<Action>${action.action}</Action>`
-      )
-      .join('\n\n');
-    previousActionsString = `You have already taken the following actions: \n${serializedActions}\n\n`;
-  }
-
-  return `The user requests the following task:
-
-${taskInstructions}
-
-${previousActionsString}
-
-Current time: ${new Date().toLocaleString()}
-
-Current page contents:
-${pageContents}`;
-}
diff --git a/src/helpers/parseResponse.ts b/src/helpers/parseResponse.ts
@@ -2,6 +2,7 @@ import { ActionPayload, availableActions } from './availableActions';
 
 export type ParsedResponseSuccess = {
   thought: string;
+  currentStep: string;
   action: string;
   parsedAction: ActionPayload;
 };
@@ -15,6 +16,7 @@ export type ParsedResponse =
 export function parseResponse(text: string): ParsedResponse {
   const thoughtMatch = text.match(/<Thought>(.*?)<\/Thought>/);
   const actionMatch = text.match(/<Action>(.*?)<\/Action>/);
+  const currentStepMatch = text.match(/<CurrentStep>(.*?)<\/CurrentStep>/);
 
   if (!thoughtMatch) {
     return {
@@ -28,8 +30,15 @@ export function parseResponse(text: string): ParsedResponse {
     };
   }
 
+  if (!currentStepMatch) {
+    return {
+      error: 'Invalid response: Current step not found in the model response.',
+    };
+  }
+
   const thought = thoughtMatch[1];
   const actionString = actionMatch[1];
+  const currentStep = currentStepMatch[1];
   const actionPattern = /(\w+)\((.*?)\)/;
   const actionParts = actionString.match(actionPattern);
 
@@ -109,6 +118,7 @@ export function parseResponse(text: string): ParsedResponse {
 
   return {
     thought,
+    currentStep: currentStep,
     action: actionString,
     parsedAction,
   };

diff --git a/src/manifest.json b/src/manifest.json
@@ -11,12 +11,16 @@
     "storage",
     "clipboardWrite",
     "debugger",
-    "management"
+    "management",
+    "sidePanel"
   ],
   "action": {
     "default_popup": "popup.html",
     "default_icon": "icon-34.png"
   },
+  "chrome_url_overrides": {
+    "sidebar": "popup.html"
+  },
   "icons": {
     "128": "icon-128.png"
   },
@@ -41,5 +45,8 @@
         "mac": "Command+Shift+Y"
       }
     }
+  },
+  "side_panel": {
+    "default_path": "popup.html"
   }
 }
diff --git a/src/state/currentTask.ts b/src/state/currentTask.ts
@@ -1,4 +1,3 @@
-import { CreateCompletionResponseUsage } from 'openai';
 import { attachDebugger, detachDebugger } from '../helpers/chromeDebugger';
 import {
   disableIncompatibleExtensions,
@@ -16,11 +15,17 @@ import { getSimplifiedDom } from '../helpers/simplifyDom';
 import { sleep, truthyFilter } from '../helpers/utils';
 import { MyStateCreator } from './store';
 
+/** Token counts stored in the optional `usage` field of a task history entry. */
+type CreateCompletionResponseUsage = Record<
+  'prompt_tokens' | 'completion_tokens' | 'total_tokens',
+  number
+>;
+
 export type TaskHistoryEntry = {
   prompt: string;
   response: string;
   action: ParsedResponse;
-  usage: CreateCompletionResponseUsage;
+  usage?: CreateCompletionResponseUsage;
 };
 
 export type CurrentTaskSlice = {
@@ -36,6 +41,7 @@ export type CurrentTaskSlice = {
     | 'performing-query'
     | 'performing-action'
     | 'waiting';
+  plan: string | null;
   actions: {
     runTask: (onError: (error: string) => void) => Promise<void>;
     interrupt: () => void;
@@ -50,6 +56,7 @@ export const createCurrentTaskSlice: MyStateCreator<CurrentTaskSlice> = (
   history: [],
   status: 'idle',
   actionStatus: 'idle',
+  plan: null,
   actions: {
     runTask: async (onError) => {
       const wasStopped = () => get().currentTask.status !== 'running';
@@ -68,6 +75,7 @@ export const createCurrentTaskSlice: MyStateCreator<CurrentTaskSlice> = (
         state.currentTask.history = [];
         state.currentTask.status = 'running';
         state.currentTask.actionStatus = 'attaching-debugger';
+        state.currentTask.plan = null;
       });
 
       try {
@@ -115,7 +123,8 @@ export const createCurrentTaskSlice: MyStateCreator<CurrentTaskSlice> = (
             ) as ParsedResponseSuccess[],
             currentDom,
             3,
-            onError
+            onError,
+            get().currentTask.plan
           );
 
           if (!query) {
@@ -170,6 +179,12 @@ export const createCurrentTaskSlice: MyStateCreator<CurrentTaskSlice> = (
           setActionStatus('waiting');
           // sleep 2 seconds. This is pretty arbitrary; we should figure out a better way to determine when the page has settled.
           await sleep(2000);
+
+          if (query.plan && !get().currentTask.plan) {
+            set((state) => {
+              state.currentTask.plan = query.plan || null;
+            });
+          }
         }
         set((state) => {
           state.currentTask.status = 'success';