From ab941a2089f31b623fa3930799705446cae1d6ff Mon Sep 17 00:00:00 2001
From: Diveyam-Mishra <mishradiveyam@gmail.com>
Date: Mon, 11 Aug 2025 03:00:16 +0530
Subject: [PATCH 01/13] New tab clicks (target=_blank / modifier clicks)
 stalled recording; child tab steps missing. Background/ad/tracker tabs
 polluted logs. Excessive duplicate navigation events per redirect/loading
 cycle. Massive explosion of input steps (hundreds of empty, unchanged
 values). Unnecessary workflow updates when steps unchanged.

New Tab Intent Heuristic:

Content script emits PREPARE_NEW_TAB on ctrl/cmd/middle click or target=_blank.
Background correlates upcoming chrome.tabs.onCreated to mark userInitiated.
Activated tabs are tracked; only activated or userInitiated tabs produce steps.

Tab Filtering:

Suppress all events (except activation) from tabs never activated and not correlated with an intent window (4s).
Reduces noise from ads/trackers.

Navigation Consolidation:

Maintain lastNavigationIndexByTab; update existing navigation step instead of appending duplicates during rapid redirects or title/url churn.

Input Event Deduplication:

Content script: per-xpath cache; skip unchanged value; debounce; skip rapid empty repeats.
Background: merge consecutive identical field edits; collapse bursts of empty values within 5s (timestamp refresh only).
Track lastInputPerKey (tabId|xpath) to decide merge vs new step.
---
 extension/src/entrypoints/background.ts | 222 ++++++++++++++----------
 extension/src/entrypoints/content.ts    |  44 +++++
 2 files changed, 176 insertions(+), 90 deletions(-)
diff --git a/extension/src/entrypoints/background.ts b/extension/src/entrypoints/background.ts
index 283a16f4..50c05309 100644
--- a/extension/src/entrypoints/background.ts
+++ b/extension/src/entrypoints/background.ts
@@ -30,6 +30,19 @@ export default defineBackground(() => {
   // Store tab information (URL, potentially title)
   const tabInfo: { [tabId: number]: { url?: string; title?: string } } = {};
 
+  // Track which tabs have been explicitly activated (brought to foreground) by the user.
+  // We will ignore events originating from tabs that were never activated to reduce noise
+  // (for example: ad / tracker tabs that load in the background).
+  const activatedTabs = new Set<number>();
+
+  // Track user clicks that are likely to open a new tab (Ctrl/Cmd + click, target=_blank etc.).
+  // Content scripts will send a PREPARE_NEW_TAB signal; we keep timestamp to correlate
+  // shortly following chrome.tabs.onCreated events so we can mark those tabs as user initiated.
+  const recentNewTabIntents: { [openerTabId: number]: number } = {};
+
+  // Heuristic window (ms) within which a created tab following a user intent is considered relevant.
+  const NEW_TAB_INTENT_WINDOW_MS = 4000;
+
   let isRecordingEnabled = true; // Default to disabled (OFF)
   let lastWorkflowHash: string | null = null; // Cache for the last logged workflow hash
 
@@ -144,6 +157,16 @@ export default defineBackground(() => {
     console.log(`Sending ${type}:`, payload);
     const tabId = payload.tabId;
     if (tabId) {
+      // Skip capturing events for tabs that have never been activated AND are not the original opener
+      // unless we have positively identified them as a recent user initiated tab (click intent -> creation).
+      if (
+        type !== "CUSTOM_TAB_ACTIVATED" &&
+        !activatedTabs.has(tabId) &&
+        !(payload.openerTabId && recentNewTabIntents[payload.openerTabId] && Date.now() - recentNewTabIntents[payload.openerTabId] < NEW_TAB_INTENT_WINDOW_MS)
+      ) {
+        // Silently ignore background noise (ad/tracker tabs) until user actually focuses them.
+        return;
+      }
       if (!sessionLogs[tabId]) {
         sessionLogs[tabId] = [];
       }
@@ -171,6 +194,12 @@ export default defineBackground(() => {
       url: tab.pendingUrl || tab.url,
       windowId: tab.windowId,
       index: tab.index,
+      userInitiated:
+        !!(
+          tab.openerTabId &&
+          recentNewTabIntents[tab.openerTabId] &&
+          Date.now() - recentNewTabIntents[tab.openerTabId] < NEW_TAB_INTENT_WINDOW_MS
+        ),
     });
   });
 
@@ -188,6 +217,7 @@ export default defineBackground(() => {
   });
 
   chrome.tabs.onActivated.addListener((activeInfo) => {
+  activatedTabs.add(activeInfo.tabId);
     sendTabEvent("CUSTOM_TAB_ACTIVATED", {
       tabId: activeInfo.tabId,
       windowId: activeInfo.windowId,
@@ -214,64 +244,90 @@ export default defineBackground(() => {
 
   function convertStoredEventsToSteps(events: StoredEvent[]): Step[] {
     const steps: Step[] = [];
+    const lastNavigationIndexByTab: Record<number, number> = {};
+    const lastInputPerKey: Record<string, { idx: number; ts: number; value: string }> = {};
 
     for (const event of events) {
       switch (event.messageType) {
-        case "CUSTOM_CLICK_EVENT": {
-          const clickEvent = event as StoredCustomClickEvent;
-          // Ensure required fields are present, even if optional in source type for some reason
+        case "CUSTOM_TAB_CREATED":
+        case "CUSTOM_TAB_UPDATED":
+        case "CUSTOM_TAB_ACTIVATED": {
+          const navUrl = (event as any).url || (event as any).changeInfo?.url;
+          if (!navUrl) break;
+          const tabId = (event as any).tabId;
+          const userInitiated = (event as any).userInitiated;
+          if (!activatedTabs.has(tabId) && !userInitiated) break; // suppress background noise
+
+          const existingIdx = lastNavigationIndexByTab[tabId];
           if (
-            clickEvent.url &&
-            clickEvent.frameUrl &&
-            clickEvent.xpath &&
-            clickEvent.elementTag
+            existingIdx !== undefined &&
+            steps[existingIdx] &&
+            steps[existingIdx].type === "navigation"
           ) {
+            // Update existing navigation (redirect / title change)
+            (steps[existingIdx] as NavigationStep).url = navUrl;
+            steps[existingIdx].timestamp = event.timestamp;
+          } else {
+            const nav: NavigationStep = {
+              type: "navigation",
+              timestamp: event.timestamp,
+              tabId,
+              url: navUrl,
+            };
+            steps.push(nav);
+            lastNavigationIndexByTab[tabId] = steps.length - 1;
+          }
+          break;
+        }
+        case "CUSTOM_CLICK_EVENT": {
+          const click = event as StoredCustomClickEvent;
+          if (click.url && click.xpath && click.elementTag) {
             const step: ClickStep = {
               type: "click",
-              timestamp: clickEvent.timestamp,
-              tabId: clickEvent.tabId,
-              url: clickEvent.url,
-              frameUrl: clickEvent.frameUrl,
-              xpath: clickEvent.xpath,
-              cssSelector: clickEvent.cssSelector,
-              elementTag: clickEvent.elementTag,
-              elementText: clickEvent.elementText,
-              screenshot: clickEvent.screenshot,
+              timestamp: click.timestamp,
+              tabId: click.tabId,
+              url: click.url,
+              frameUrl: click.frameUrl,
+              xpath: click.xpath,
+              cssSelector: click.cssSelector,
+              elementTag: click.elementTag,
+              elementText: click.elementText,
+              screenshot: click.screenshot,
             };
             steps.push(step);
           } else {
-            console.warn("Skipping incomplete CUSTOM_CLICK_EVENT:", clickEvent);
+            console.warn("Skipping incomplete CUSTOM_CLICK_EVENT", click);
           }
           break;
         }
-
         case "CUSTOM_INPUT_EVENT": {
           const inputEvent = event as StoredCustomInputEvent;
-          if (
-            inputEvent.url &&
-            // inputEvent.frameUrl && // frameUrl might be null/undefined in some cases, let's allow merging if only one is present or both match
-            inputEvent.xpath &&
-            inputEvent.elementTag
-          ) {
+          if (inputEvent.url && inputEvent.xpath && inputEvent.elementTag) {
+            const key = `${inputEvent.tabId}|${inputEvent.xpath}`;
+            const prior = lastInputPerKey[key];
+            const nowTs = inputEvent.timestamp;
+            const isEmpty = (inputEvent as any).value === "";
+            if (isEmpty && prior && prior.value === "" && nowTs - prior.ts < 5000) {
+              // collapse rapid-fire repeated empties
+              steps[prior.idx].timestamp = nowTs;
+              break;
+            }
             const lastStep = steps.length > 0 ? steps[steps.length - 1] : null;
-
-            // Check if the last step was a mergeable input event
             if (
               lastStep &&
               lastStep.type === "input" &&
               lastStep.tabId === inputEvent.tabId &&
               lastStep.url === inputEvent.url &&
-              lastStep.frameUrl === inputEvent.frameUrl && // Ensure frameUrls match if both exist
+              lastStep.frameUrl === inputEvent.frameUrl &&
               lastStep.xpath === inputEvent.xpath &&
               lastStep.cssSelector === inputEvent.cssSelector &&
               lastStep.elementTag === inputEvent.elementTag
             ) {
-              // Update the last input step
               (lastStep as InputStep).value = inputEvent.value;
-              lastStep.timestamp = inputEvent.timestamp; // Update to latest timestamp
-              (lastStep as InputStep).screenshot = inputEvent.screenshot; // Update to latest screenshot
+              lastStep.timestamp = inputEvent.timestamp;
+              (lastStep as InputStep).screenshot = inputEvent.screenshot;
+              lastInputPerKey[key] = { idx: steps.length - 1, ts: nowTs, value: (inputEvent as any).value };
             } else {
-              // Add a new input step
               const newStep: InputStep = {
                 type: "input",
                 timestamp: inputEvent.timestamp,
@@ -285,24 +341,22 @@ export default defineBackground(() => {
                 screenshot: inputEvent.screenshot,
               };
               steps.push(newStep);
+              lastInputPerKey[key] = { idx: steps.length - 1, ts: nowTs, value: (inputEvent as any).value };
             }
           } else {
-            console.warn("Skipping incomplete CUSTOM_INPUT_EVENT:", inputEvent);
+            console.warn("Skipping incomplete CUSTOM_INPUT_EVENT", inputEvent);
           }
           break;
         }
-
         case "CUSTOM_KEY_EVENT": {
           const keyEvent = event as StoredCustomKeyEvent;
-          // Key press might not always have a target element (xpath, etc.)
-          // but needs at least url and key
           if (keyEvent.url && keyEvent.key) {
             const step: KeyPressStep = {
               type: "key_press",
               timestamp: keyEvent.timestamp,
               tabId: keyEvent.tabId,
               url: keyEvent.url,
-              frameUrl: keyEvent.frameUrl, // Can be missing
+              frameUrl: keyEvent.frameUrl,
               key: keyEvent.key,
               xpath: keyEvent.xpath,
               cssSelector: keyEvent.cssSelector,
@@ -311,77 +365,56 @@ export default defineBackground(() => {
             };
             steps.push(step);
           } else {
-            console.warn("Skipping incomplete CUSTOM_KEY_EVENT:", keyEvent);
+            console.warn("Skipping incomplete CUSTOM_KEY_EVENT", keyEvent);
           }
           break;
         }
-
         case "RRWEB_EVENT": {
-          // We only care about scroll events from rrweb for now
           const rrEvent = event as StoredRrwebEvent;
-          if (
-            rrEvent.type === EventType.IncrementalSnapshot &&
-            rrEvent.data.source === IncrementalSource.Scroll
-          ) {
-            const scrollData = rrEvent.data as {
-              id: number;
-              x: number;
-              y: number;
-            }; // Type assertion for clarity
-            const currentTabInfo = tabInfo[rrEvent.tabId]; // Get associated tab info for URL
-
-            // Check if the last step added was a mergeable scroll event
-            const lastStep = steps.length > 0 ? steps[steps.length - 1] : null;
             if (
-              lastStep &&
-              lastStep.type === "scroll" &&
-              lastStep.tabId === rrEvent.tabId &&
-              (lastStep as ScrollStep).targetId === scrollData.id
+              rrEvent.type === EventType.IncrementalSnapshot &&
+              rrEvent.data.source === IncrementalSource.Scroll
             ) {
-              // Update the last scroll step
-              (lastStep as ScrollStep).scrollX = scrollData.x;
-              (lastStep as ScrollStep).scrollY = scrollData.y;
-              lastStep.timestamp = rrEvent.timestamp; // Update to latest timestamp
-              // URL should already be set from the first event in the sequence
-            } else {
-              // Add a new scroll step
-              const newStep: ScrollStep = {
-                type: "scroll",
+              const scrollData = rrEvent.data as { id: number; x: number; y: number };
+              const currentTabInfo = tabInfo[rrEvent.tabId];
+              const lastStep = steps.length > 0 ? steps[steps.length - 1] : null;
+              if (
+                lastStep &&
+                lastStep.type === "scroll" &&
+                lastStep.tabId === rrEvent.tabId &&
+                (lastStep as ScrollStep).targetId === scrollData.id
+              ) {
+                (lastStep as ScrollStep).scrollX = scrollData.x;
+                (lastStep as ScrollStep).scrollY = scrollData.y;
+                lastStep.timestamp = rrEvent.timestamp;
+              } else {
+                const scrollStep: ScrollStep = {
+                  type: "scroll",
+                  timestamp: rrEvent.timestamp,
+                  tabId: rrEvent.tabId,
+                  targetId: scrollData.id,
+                  scrollX: scrollData.x,
+                  scrollY: scrollData.y,
+                  url: currentTabInfo?.url,
+                };
+                steps.push(scrollStep);
+              }
+            } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) {
+              const metaData = rrEvent.data as { href: string };
+              const nav: NavigationStep = {
+                type: "navigation",
                 timestamp: rrEvent.timestamp,
                 tabId: rrEvent.tabId,
-                targetId: scrollData.id,
-                scrollX: scrollData.x,
-                scrollY: scrollData.y,
-                url: currentTabInfo?.url, // Add URL if available
+                url: metaData.href,
               };
-              steps.push(newStep);
+              steps.push(nav);
             }
-          } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) {
-            // Also handle rrweb meta events as navigation
-            const metaData = rrEvent.data as { href: string };
-            const step: NavigationStep = {
-              type: "navigation",
-              timestamp: rrEvent.timestamp,
-              tabId: rrEvent.tabId,
-              url: metaData.href,
-            };
-            steps.push(step);
-          }
           break;
         }
-
-        // Add cases for other StoredEvent types to Step types if needed
-        // e.g., CUSTOM_SELECT_EVENT -> SelectStep
-        // e.g., CUSTOM_TAB_CREATED -> TabCreatedStep
-        // RRWEB_EVENT type 4 (Meta) or 3 (FullSnapshot) could potentially map to NavigationStep if needed.
-
         default:
-          // Ignore other event types for now
-          // console.log("Ignoring event type:", event.messageType);
           break;
       }
     }
-
     return steps;
   }
 
@@ -396,6 +429,8 @@ export default defineBackground(() => {
       "CUSTOM_INPUT_EVENT",
       "CUSTOM_SELECT_EVENT",
       "CUSTOM_KEY_EVENT",
+  // Synthetic event we will emit from content script just before an expected new tab open.
+  "PREPARE_NEW_TAB",
     ];
     if (
       message.type === "RRWEB_EVENT" ||
@@ -412,6 +447,13 @@ export default defineBackground(() => {
       const tabId = sender.tab.id;
       const isCustomEvent = customEventTypes.includes(message.type);
 
+      // Record intent for new tab opening to correlate with onCreated event.
+      if (message.type === "PREPARE_NEW_TAB") {
+        recentNewTabIntents[sender.tab.id] = Date.now();
+        // We do not store this as a workflow step; it's only heuristic metadata.
+        return false;
+      }
+
       // Function to store the event
       const storeEvent = (eventPayload: any, screenshotDataUrl?: string) => {
         if (!sessionLogs[tabId]) {
diff --git a/extension/src/entrypoints/content.ts b/extension/src/entrypoints/content.ts
index 8f7f233f..a1a60231 100644
--- a/extension/src/entrypoints/content.ts
+++ b/extension/src/entrypoints/content.ts
@@ -240,6 +240,25 @@ function handleCustomClick(event: MouseEvent) {
   const targetElement = event.target as HTMLElement;
   if (!targetElement) return;
 
+  try {
+    // Detect if this click is likely to open a new tab so background can correlate upcoming tab creation.
+    // Heuristics: modifier key (Ctrl/Cmd), middle button, OR anchor with target=_blank / rel noopener or noreferrer.
+    const isMiddle = event.button === 1;
+    const isModifier = event.metaKey || event.ctrlKey; // Cmd (mac) or Ctrl (win/linux)
+    let anchorOpensNew = false;
+    if (targetElement instanceof HTMLAnchorElement) {
+      const a = targetElement as HTMLAnchorElement;
+      anchorOpensNew =
+        (a.target && a.target.toLowerCase() === "_blank") ||
+        a.rel.split(/\s+/).some((r) => r.toLowerCase() === "noopener" || r.toLowerCase() === "noreferrer");
+    }
+    if (isMiddle || isModifier || anchorOpensNew) {
+      chrome.runtime.sendMessage({ type: "PREPARE_NEW_TAB", payload: { reason: "potential_new_tab_click" } });
+    }
+  } catch (e) {
+    console.warn("Failed to evaluate new tab heuristic", e);
+  }
+
   try {
     const xpath = getXPath(targetElement);
     const clickData = {
@@ -263,12 +282,17 @@ function handleCustomClick(event: MouseEvent) {
 // --- End Custom Click Handler ---
 
 // --- Custom Input Handler ---
+// Maintain last recorded value & timestamp per element (keyed by xpath) to suppress noisy repeats
+const lastInputRecord: Record<string, { value: string; ts: number }> = {};
 function handleInput(event: Event) {
   if (!isRecordingActive) return;
   const targetElement = event.target as HTMLInputElement | HTMLTextAreaElement;
   if (!targetElement || !("value" in targetElement)) return;
   const isPassword = targetElement.type === "password";
 
+  // Ignore programmatic (non user-trusted) input events – these often cause massive duplication
+  if (!(event as InputEvent).isTrusted) return;
+
   try {
     const xpath = getXPath(targetElement);
     const inputData = {
@@ -280,6 +304,26 @@ function handleInput(event: Event) {
       elementTag: targetElement.tagName,
       value: isPassword ? "********" : targetElement.value,
     };
+
+    // Dedupe rule 1: If value unchanged for this element and within debounce window, skip
+    const DEBOUNCE_MS_INPUT = 1500;
+    const prev = lastInputRecord[xpath];
+    if (prev && prev.value === inputData.value && inputData.timestamp - prev.ts < DEBOUNCE_MS_INPUT) {
+      return; // Suppress noisy duplicate
+    }
+
+    // Dedupe rule 2: If value is empty string and we already recorded empty in last 5s, suppress further empties
+    if (
+      inputData.value === "" &&
+      prev &&
+      prev.value === "" &&
+      inputData.timestamp - prev.ts < 5000
+    ) {
+      return;
+    }
+
+    // Store/update last record metadata
+    lastInputRecord[xpath] = { value: inputData.value, ts: inputData.timestamp };
     console.log("Sending CUSTOM_INPUT_EVENT:", inputData);
     chrome.runtime.sendMessage({
       type: "CUSTOM_INPUT_EVENT",

From c9d6dec933e45cb2a5dec4330cfaa9379b48f601 Mon Sep 17 00:00:00 2001
From: Diveyam-Mishra <mishradiveyam@gmail.com>
Date: Mon, 11 Aug 2025 03:11:20 +0530
Subject: [PATCH 02/13] Do not record duplicate steps just because of
 different mouse positions

---
 extension/src/entrypoints/background.ts | 155 ++++++++++++++++++------
 1 file changed, 119 insertions(+), 36 deletions(-)

diff --git a/extension/src/entrypoints/background.ts b/extension/src/entrypoints/background.ts
index 50c05309..ad039b04 100644
--- a/extension/src/entrypoints/background.ts
+++ b/extension/src/entrypoints/background.ts
@@ -79,13 +79,72 @@ export default defineBackground(() => {
   // Function to broadcast workflow data updates to the console bus
   async function broadcastWorkflowDataUpdate(): Promise<Workflow> {
     // console.log("[DEBUG] broadcastWorkflowDataUpdate: Entered function"); // Optional: Keep for debugging
-    const allSteps: Step[] = Object.keys(sessionLogs)
+    const rawSteps: Step[] = Object.keys(sessionLogs)
       .flatMap((tabIdStr) => {
         const tabId = parseInt(tabIdStr, 10);
         return convertStoredEventsToSteps(sessionLogs[tabId] || []);
       })
       .sort((a, b) => a.timestamp - b.timestamp); // Sort chronologically
 
+    // Post-process to collapse consecutive duplicates that only differ by timestamp (e.g. repeated identical navigations)
+    const allSteps: Step[] = [];
+    for (const step of rawSteps) {
+      const last = allSteps.length ? allSteps[allSteps.length - 1] : null;
+      if (!last) {
+        allSteps.push(step);
+        continue;
+      }
+      let isDuplicate = false;
+      if (last.type === step.type) {
+        switch (step.type) {
+          case 'navigation':
+            isDuplicate = (last as NavigationStep).url === (step as NavigationStep).url && last.tabId === step.tabId;
+            break;
+          case 'input':
+            isDuplicate =
+              last.tabId === step.tabId &&
+              (last as any).url === (step as any).url &&
+              (last as any).frameUrl === (step as any).frameUrl &&
+              (last as any).xpath === (step as any).xpath &&
+              (last as any).elementTag === (step as any).elementTag &&
+              (last as any).value === (step as any).value;
+            break;
+          case 'click':
+            isDuplicate =
+              last.tabId === step.tabId &&
+              (last as any).url === (step as any).url &&
+              (last as any).frameUrl === (step as any).frameUrl &&
+              (last as any).xpath === (step as any).xpath &&
+              (last as any).elementTag === (step as any).elementTag &&
+              (last as any).elementText === (step as any).elementText;
+            break;
+          case 'scroll':
+            isDuplicate =
+              last.tabId === step.tabId &&
+              (last as any).targetId === (step as any).targetId &&
+              (last as any).scrollX === (step as any).scrollX &&
+              (last as any).scrollY === (step as any).scrollY;
+            break;
+          case 'key_press':
+            isDuplicate =
+              last.tabId === step.tabId &&
+              (last as any).url === (step as any).url &&
+              (last as any).key === (step as any).key &&
+              (last as any).xpath === (step as any).xpath;
+            break;
+        }
+      }
+      if (isDuplicate) {
+        // Update timestamp (and screenshot if present) to most recent but don't add new step
+        last.timestamp = step.timestamp;
+        if ((step as any).screenshot) {
+          (last as any).screenshot = (step as any).screenshot;
+        }
+      } else {
+        allSteps.push(step);
+      }
+    }
+
     // Create the workflowData object *after* sorting steps, but hash only steps
     const workflowData: Workflow = {
       name: "Recorded Workflow",
@@ -344,19 +403,22 @@ export default defineBackground(() => {
               lastInputPerKey[key] = { idx: steps.length - 1, ts: nowTs, value: (inputEvent as any).value };
             }
           } else {
-            console.warn("Skipping incomplete CUSTOM_INPUT_EVENT", inputEvent);
+            console.warn("Skipping incomplete CUSTOM_INPUT_EVENT:", inputEvent);
           }
           break;
         }
+
         case "CUSTOM_KEY_EVENT": {
           const keyEvent = event as StoredCustomKeyEvent;
+          // Key press might not always have a target element (xpath, etc.)
+          // but needs at least url and key
           if (keyEvent.url && keyEvent.key) {
             const step: KeyPressStep = {
               type: "key_press",
               timestamp: keyEvent.timestamp,
               tabId: keyEvent.tabId,
               url: keyEvent.url,
-              frameUrl: keyEvent.frameUrl,
+              frameUrl: keyEvent.frameUrl, // Can be missing
               key: keyEvent.key,
               xpath: keyEvent.xpath,
               cssSelector: keyEvent.cssSelector,
@@ -365,56 +427,77 @@ export default defineBackground(() => {
             };
             steps.push(step);
           } else {
-            console.warn("Skipping incomplete CUSTOM_KEY_EVENT", keyEvent);
+            console.warn("Skipping incomplete CUSTOM_KEY_EVENT:", keyEvent);
           }
           break;
         }
+
         case "RRWEB_EVENT": {
+          // We only care about scroll events from rrweb for now
           const rrEvent = event as StoredRrwebEvent;
+          if (
+            rrEvent.type === EventType.IncrementalSnapshot &&
+            rrEvent.data.source === IncrementalSource.Scroll
+          ) {
+            const scrollData = rrEvent.data as {
+              id: number;
+              x: number;
+              y: number;
+            }; // Type assertion for clarity
+            const currentTabInfo = tabInfo[rrEvent.tabId]; // Get associated tab info for URL
+
+            // Check if the last step added was a mergeable scroll event
+            const lastStep = steps.length > 0 ? steps[steps.length - 1] : null;
             if (
-              rrEvent.type === EventType.IncrementalSnapshot &&
-              rrEvent.data.source === IncrementalSource.Scroll
+              lastStep &&
+              lastStep.type === "scroll" &&
+              lastStep.tabId === rrEvent.tabId &&
+              (lastStep as ScrollStep).targetId === scrollData.id
             ) {
-              const scrollData = rrEvent.data as { id: number; x: number; y: number };
-              const currentTabInfo = tabInfo[rrEvent.tabId];
-              const lastStep = steps.length > 0 ? steps[steps.length - 1] : null;
-              if (
-                lastStep &&
-                lastStep.type === "scroll" &&
-                lastStep.tabId === rrEvent.tabId &&
-                (lastStep as ScrollStep).targetId === scrollData.id
-              ) {
-                (lastStep as ScrollStep).scrollX = scrollData.x;
-                (lastStep as ScrollStep).scrollY = scrollData.y;
-                lastStep.timestamp = rrEvent.timestamp;
-              } else {
-                const scrollStep: ScrollStep = {
-                  type: "scroll",
-                  timestamp: rrEvent.timestamp,
-                  tabId: rrEvent.tabId,
-                  targetId: scrollData.id,
-                  scrollX: scrollData.x,
-                  scrollY: scrollData.y,
-                  url: currentTabInfo?.url,
-                };
-                steps.push(scrollStep);
-              }
-            } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) {
-              const metaData = rrEvent.data as { href: string };
-              const nav: NavigationStep = {
-                type: "navigation",
+              // Update the last scroll step
+              (lastStep as ScrollStep).scrollX = scrollData.x;
+              (lastStep as ScrollStep).scrollY = scrollData.y;
+              lastStep.timestamp = rrEvent.timestamp; // Update to latest timestamp
+              // URL should already be set from the first event in the sequence
+            } else {
+              // Add a new scroll step
+              const newStep: ScrollStep = {
+                type: "scroll",
                 timestamp: rrEvent.timestamp,
                 tabId: rrEvent.tabId,
-                url: metaData.href,
+                targetId: scrollData.id,
+                scrollX: scrollData.x,
+                scrollY: scrollData.y,
+                url: currentTabInfo?.url, // Add URL if available
               };
-              steps.push(nav);
+              steps.push(newStep);
             }
+          } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) {
+            // Also handle rrweb meta events as navigation
+            const metaData = rrEvent.data as { href: string };
+            const step: NavigationStep = {
+              type: "navigation",
+              timestamp: rrEvent.timestamp,
+              tabId: rrEvent.tabId,
+              url: metaData.href,
+            };
+            steps.push(step);
+          }
           break;
         }
+
+        // Add cases for other StoredEvent types to Step types if needed
+        // e.g., CUSTOM_SELECT_EVENT -> SelectStep
+        // e.g., CUSTOM_TAB_CREATED -> TabCreatedStep
+        // RRWEB_EVENT type 4 (Meta) or 3 (FullSnapshot) could potentially map to NavigationStep if needed.
+
         default:
+          // Ignore other event types for now
+          // console.log("Ignoring event type:", event.messageType);
           break;
       }
     }
+
     return steps;
   }
 

From cd9ebc7d7538ab9e8e7ad682c5be116e8c56a11e Mon Sep 17 00:00:00 2001
From: Diveyam-Mishra <mishradiveyam@gmail.com>
Date: Mon, 11 Aug 2025 04:05:48 +0530
Subject: [PATCH 03/13] works but ui needs to change

---
 extension/src/entrypoints/background.ts       | 79 ++++++++++++++++
 extension/src/entrypoints/content.ts          | 94 ++++++++++++-------
 extension/src/entrypoints/options.html        | 80 ++++++++++++++++
 .../sidepanel/components/recording-view.tsx   | 40 +++++++-
 extension/src/lib/types.ts                    |  5 +
 extension/src/lib/workflow-types.ts           |  5 +
 extension/wxt.config.ts                       | 12 ++-
 workflows/workflow_use/recorder/service.py    | 27 +++++-
 8 files changed, 298 insertions(+), 44 deletions(-)
 create mode 100644 extension/src/entrypoints/options.html

diff --git a/extension/src/entrypoints/background.ts b/extension/src/entrypoints/background.ts
index ad039b04..6e1ae033 100644
--- a/extension/src/entrypoints/background.ts
+++ b/extension/src/entrypoints/background.ts
@@ -39,6 +39,24 @@ export default defineBackground(() => {
   // Content scripts will send a PREPARE_NEW_TAB signal; we keep timestamp to correlate
   // shortly following chrome.tabs.onCreated events so we can mark those tabs as user initiated.
   const recentNewTabIntents: { [openerTabId: number]: number } = {};
+  // Record iframe URLs that the user actually interacted with (via custom events) per tab
+  const interactedFrameUrls: Record<number, Set<string>> = {};
+  // Additionally track last interaction time per frame for time-window gating
+  const interactedFrameTimes: Record<number, Record<string, number>> = {};
+  // Hostname patterns for iframe navigation noise we want to suppress
+  const BLOCKED_IFRAME_HOST_PATTERNS: RegExp[] = [
+    /doubleclick\.net$/i,
+    /googlesyndication\.com$/i,
+    /googleadservices\.com$/i,
+    /amazon-adsystem\.com$/i,
+    /recaptcha\.google\.com$/i,
+    /recaptcha\.net$/i,
+    /googletagmanager\.com$/i,
+    /indexww\.com$/i,
+    /adtrafficquality\.google$/i,
+    /2mdn\.net$/i,
+    /gstaticadssl\.googleapis\.com$/i,
+  ];
 
   // Heuristic window (ms) within which a created tab following a user intent is considered relevant.
   const NEW_TAB_INTENT_WINDOW_MS = 4000;
@@ -301,6 +319,25 @@ export default defineBackground(() => {
 
   // --- Conversion Function ---
 
+  const DEFAULT_SETTINGS = {
+    enableIframes: true as boolean,
+    iframeWindow: 3000 as number,
+    blocklist: [
+      'doubleclick.net','googlesyndication.com','googleadservices.com',
+      'amazon-adsystem.com','2mdn.net','recaptcha.google.com','recaptcha.net',
+      'googletagmanager.com','indexww.com','adtrafficquality.google'
+    ] as string[],
+    allowlist: [] as string[],
+  };
+  let settings: { enableIframes: boolean; iframeWindow: number; blocklist: string[]; allowlist: string[] } = { ...DEFAULT_SETTINGS };
+  chrome.storage.sync.get(DEFAULT_SETTINGS, (s: any) => { settings = { ...settings, ...s }; });
+  chrome.storage.onChanged.addListener((changes, area) => {
+    if (area !== 'sync') return;
+    const next = { ...settings } as any;
+    for (const k of Object.keys(changes)) (next as any)[k] = (changes as any)[k].newValue;
+    settings = next;
+  });
+
   function convertStoredEventsToSteps(events: StoredEvent[]): Step[] {
     const steps: Step[] = [];
     const lastNavigationIndexByTab: Record<number, number> = {};
@@ -347,6 +384,7 @@ export default defineBackground(() => {
               tabId: click.tabId,
               url: click.url,
               frameUrl: click.frameUrl,
+              frameIdPath: (click as any).frameIdPath,
               xpath: click.xpath,
               cssSelector: click.cssSelector,
               elementTag: click.elementTag,
@@ -393,6 +431,7 @@ export default defineBackground(() => {
                 tabId: inputEvent.tabId,
                 url: inputEvent.url,
                 frameUrl: inputEvent.frameUrl,
+                frameIdPath: (inputEvent as any).frameIdPath,
                 xpath: inputEvent.xpath,
                 cssSelector: inputEvent.cssSelector,
                 elementTag: inputEvent.elementTag,
@@ -419,6 +458,7 @@ export default defineBackground(() => {
               tabId: keyEvent.tabId,
               url: keyEvent.url,
               frameUrl: keyEvent.frameUrl, // Can be missing
+              frameIdPath: (keyEvent as any).frameIdPath,
               key: keyEvent.key,
               xpath: keyEvent.xpath,
               cssSelector: keyEvent.cssSelector,
@@ -475,11 +515,43 @@ export default defineBackground(() => {
           } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) {
             // Also handle rrweb meta events as navigation
             const metaData = rrEvent.data as { href: string };
+            const href = metaData.href;
+            // Drop about:blank always
+            if (href === 'about:blank') {
+              break;
+            }
+            try {
+              const urlObj = new URL(href);
+              const host = urlObj.hostname;
+              // Allowlist overrides blocklist
+              const inAllow = settings.allowlist.some(d => host.endsWith(d));
+              const inBlock = settings.blocklist.some(d => host.endsWith(d));
+              if (!inAllow && inBlock) {
+                break;
+              }
+              if (!settings.enableIframes && !(rrEvent as any).isTopFrame) {
+                break; // user disabled iframe recording
+              }
+              // If top frame, allow
+              if ((rrEvent as any).isTopFrame) {
+                // allowed
+              } else {
+                const fUrl = (rrEvent as any).frameUrl as string | undefined;
+                if (!fUrl) break;
+                const times = interactedFrameTimes[rrEvent.tabId] || {};
+                const lastTs = times[fUrl];
+                if (!lastTs) break;
+                if (Date.now() - lastTs > settings.iframeWindow) break;
+              }
+            } catch {
+              break;
+            }
             const step: NavigationStep = {
               type: "navigation",
               timestamp: rrEvent.timestamp,
               tabId: rrEvent.tabId,
               url: metaData.href,
+              // frameIdPath could be attached if needed
             };
             steps.push(step);
           }
@@ -559,6 +631,13 @@ export default defineBackground(() => {
           screenshot: screenshotDataUrl,
         };
         sessionLogs[tabId].push(eventWithMeta);
+        // Mark frame as interacted so subsequent iframe meta navigations can be allowed
+        if (message.type.startsWith("CUSTOM_") && eventPayload.frameUrl) {
+          if (!interactedFrameUrls[tabId]) interactedFrameUrls[tabId] = new Set();
+          interactedFrameUrls[tabId].add(eventPayload.frameUrl);
+          if (!interactedFrameTimes[tabId]) interactedFrameTimes[tabId] = {};
+          interactedFrameTimes[tabId][eventPayload.frameUrl] = Date.now();
+        }
         broadcastWorkflowDataUpdate(); // Call is async, will not block
         // console.log(`Stored ${message.type} from tab ${tabId}`);
       };
diff --git a/extension/src/entrypoints/content.ts b/extension/src/entrypoints/content.ts
index a1a60231..a646562f 100644
--- a/extension/src/entrypoints/content.ts
+++ b/extension/src/entrypoints/content.ts
@@ -122,6 +122,16 @@ function startRecorder() {
     emit(event) {
       if (!isRecordingActive) return;
 
+      const frameUrl = window.location.href;
+      const isTopFrame = window.self === window.top;
+      const frameIdPath = (() => {
+        try {
+          let win: any = window; const parts: number[] = [];
+          while (win !== win.parent) { const parent = win.parent; let idx=0; for (let i=0;i<parent.frames.length;i++){ if(parent.frames[i]===win){idx=i;break;} } parts.unshift(idx); win=parent; if(parts.length>10) break; }
+          return parts.length ? parts.join('.') : '0';
+        } catch { return '0'; }
+      })();
+
       // Handle scroll events with debouncing and direction detection
       if (
         event.type === EventType.IncrementalSnapshot &&
@@ -157,7 +167,10 @@ function startRecorder() {
             type: "RRWEB_EVENT",
             payload: {
               ...event,
-              data: roundedScrollData, // Use rounded coordinates
+              data: roundedScrollData,
+              frameUrl,
+              frameIdPath,
+              isTopFrame,
             },
           });
           lastDirection = currentDirection;
@@ -178,15 +191,18 @@ function startRecorder() {
             type: "RRWEB_EVENT",
             payload: {
               ...event,
-              data: roundedScrollData, // Use rounded coordinates
+              data: roundedScrollData,
+              frameUrl,
+              frameIdPath,
+              isTopFrame,
             },
           });
           scrollTimeout = null;
           lastDirection = null; // Reset direction for next scroll
         }, DEBOUNCE_MS);
       } else {
-        // Pass through non-scroll events unchanged
-        chrome.runtime.sendMessage({ type: "RRWEB_EVENT", payload: event });
+        // Pass through non-scroll events unchanged, but include frame context for filtering in background
+        chrome.runtime.sendMessage({ type: "RRWEB_EVENT", payload: { ...event, frameUrl, frameIdPath, isTopFrame } });
       }
     },
     maskInputOptions: {
@@ -239,45 +255,38 @@ function handleCustomClick(event: MouseEvent) {
   if (!isRecordingActive) return;
   const targetElement = event.target as HTMLElement;
   if (!targetElement) return;
-
-  try {
-    // Detect if this click is likely to open a new tab so background can correlate upcoming tab creation.
-    // Heuristics: modifier key (Ctrl/Cmd / middle button) OR anchor with target=_blank / rel noopener.
-    const isMiddle = event.button === 1;
-    const isModifier = event.metaKey || event.ctrlKey; // Cmd (mac) or Ctrl (win/linux)
-    let anchorOpensNew = false;
-    if (targetElement instanceof HTMLAnchorElement) {
-      const a = targetElement as HTMLAnchorElement;
-      anchorOpensNew =
-        (a.target && a.target.toLowerCase() === "_blank") ||
-        a.rel.split(/\s+/).some((r) => r.toLowerCase() === "noopener" || r.toLowerCase() === "noreferrer");
-    }
-    if (isMiddle || isModifier || anchorOpensNew) {
-      chrome.runtime.sendMessage({ type: "PREPARE_NEW_TAB", payload: { reason: "potential_new_tab_click" } });
-    }
-  } catch (e) {
-    console.warn("Failed to evaluate new tab heuristic", e);
-  }
-
+  // Determine a frame identifier (best-effort). Top frame = 0, nested frames build path.
+  const frameIdPath = (() => {
+    try {
+      let win: any = window;
+      const parts: number[] = [];
+      while (win !== win.parent) {
+        const parent = win.parent;
+        let index = 0;
+        for (let i = 0; i < parent.frames.length; i++) {
+          if (parent.frames[i] === win) { index = i; break; }
+        }
+        parts.unshift(index);
+        win = parent;
+        if (parts.length > 10) break; // safety
+      }
+      return parts.length ? parts.join('.') : '0';
+    } catch { return '0'; }
+  })();
   try {
     const xpath = getXPath(targetElement);
     const clickData = {
       timestamp: Date.now(),
-      url: document.location.href, // Use document.location for main page URL
-      frameUrl: window.location.href, // URL of the frame where the event occurred
-      xpath: xpath,
+      url: document.location.href,
+      frameUrl: window.location.href,
+      frameIdPath,
+      xpath,
       cssSelector: getEnhancedCSSSelector(targetElement, xpath),
       elementTag: targetElement.tagName,
       elementText: targetElement.textContent?.trim().slice(0, 200) || "",
     };
-    console.log("Sending CUSTOM_CLICK_EVENT:", clickData);
-    chrome.runtime.sendMessage({
-      type: "CUSTOM_CLICK_EVENT",
-      payload: clickData,
-    });
-  } catch (error) {
-    console.error("Error capturing click data:", error);
-  }
+    chrome.runtime.sendMessage({ type: "CUSTOM_CLICK_EVENT", payload: clickData });
+  } catch (error) { console.error("Error capturing click data:", error); }
 }
 // --- End Custom Click Handler ---
 
@@ -293,12 +302,20 @@ function handleInput(event: Event) {
   // Ignore programmatic (non user-trusted) input events – these often cause massive duplication
   if (!(event as InputEvent).isTrusted) return;
 
+  const frameIdPath = (() => {
+    try {
+      let win: any = window; const parts: number[] = [];
+      while (win !== win.parent) { const parent = win.parent; let idx=0; for (let i=0;i<parent.frames.length;i++){ if(parent.frames[i]===win){idx=i;break;} } parts.unshift(idx); win=parent; if(parts.length>10) break; }
+      return parts.length ? parts.join('.') : '0';
+    } catch { return '0'; }
+  })();
   try {
     const xpath = getXPath(targetElement);
     const inputData = {
       timestamp: Date.now(),
       url: document.location.href,
       frameUrl: window.location.href,
+      frameIdPath,
       xpath: xpath,
       cssSelector: getEnhancedCSSSelector(targetElement, xpath),
       elementTag: targetElement.tagName,
@@ -341,6 +358,7 @@ function handleSelectChange(event: Event) {
   const targetElement = event.target as HTMLSelectElement;
   // Ensure it's a select element
   if (!targetElement || targetElement.tagName !== "SELECT") return;
+  const frameIdPath = (() => { try { let win:any=window; const parts:number[]=[]; while(win!==win.parent){const parent=win.parent; let idx=0; for(let i=0;i<parent.frames.length;i++){ if(parent.frames[i]===win){idx=i;break;} } parts.unshift(idx); win=parent; if(parts.length>10) break;} return parts.length?parts.join('.'):'0'; } catch { return '0'; } })();
 
   try {
     const xpath = getXPath(targetElement);
@@ -349,6 +367,7 @@ function handleSelectChange(event: Event) {
       timestamp: Date.now(),
       url: document.location.href,
       frameUrl: window.location.href,
+      frameIdPath,
       xpath: xpath,
       cssSelector: getEnhancedCSSSelector(targetElement, xpath),
       elementTag: targetElement.tagName,
@@ -421,11 +440,13 @@ function handleKeydown(event: KeyboardEvent) {
       }
     }
 
+    const frameIdPath = (() => { try { let win:any=window; const parts:number[]=[]; while(win!==win.parent){const parent=win.parent; let idx=0; for(let i=0;i<parent.frames.length;i++){ if(parent.frames[i]===win){idx=i;break;} } parts.unshift(idx); win=parent; if(parts.length>10) break;} return parts.length?parts.join('.'):'0'; } catch { return '0'; } })();
     try {
       const keyData = {
         timestamp: Date.now(),
         url: document.location.href,
         frameUrl: window.location.href,
+        frameIdPath,
         key: keyToLog, // The key or combination pressed
         xpath: xpath, // XPath of the element in focus (if any)
         cssSelector: cssSelector, // CSS selector of the element in focus (if any)
@@ -588,6 +609,9 @@ function handleBlur(event: FocusEvent) {
 
 export default defineContentScript({
   matches: ["<all_urls>"],
+  // Ensure injection into all frames (iframes) so we can capture interactions inside nested documents.
+  allFrames: true,
+  matchAboutBlank: true,
   main(ctx) {
     // Listener for status updates from the background script
     chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
diff --git a/extension/src/entrypoints/options.html b/extension/src/entrypoints/options.html
new file mode 100644
index 00000000..dfb7bf49
--- /dev/null
+++ b/extension/src/entrypoints/options.html
@@ -0,0 +1,80 @@
+<!doctype html>
+<html>
+  <head>
+    <meta charset="UTF-8" />
+    <title>Workflow Use - Options</title>
+    <style>
+      body { font-family: system-ui, sans-serif; margin: 16px; }
+      label { display: block; margin: 8px 0 4px; font-weight: 600; }
+      textarea { width: 100%; height: 100px; font-family: ui-monospace, monospace; }
+      input[type="number"] { width: 160px; }
+      .row { margin-bottom: 12px; }
+      .hint { color: #555; font-size: 12px; }
+      .section { border: 1px solid #ddd; padding: 12px; border-radius: 8px; margin-bottom: 16px; }
+    </style>
+  </head>
+  <body>
+    <h2>Recording Settings</h2>
+    <div class="section">
+      <div class="row">
+        <label>
+          <input type="checkbox" id="enableIframes" /> Enable recording from iframes
+        </label>
+        <div class="hint">When disabled, iframe-originated navigation/meta events are ignored.</div>
+      </div>
+      <div class="row">
+        <label for="iframeWindow">Iframe allow window (ms)</label>
+        <input type="number" id="iframeWindow" min="0" step="100" />
+        <div class="hint">Time after a user interaction in an iframe during which rrweb meta navigations are allowed.</div>
+      </div>
+      <div class="row">
+        <label for="blocklist">Blocked domains (newline separated)</label>
+        <textarea id="blocklist" placeholder="example.com&#10;ads.example.org"></textarea>
+      </div>
+      <div class="row">
+        <label for="allowlist">Allowed domains (newline separated)</label>
+        <textarea id="allowlist" placeholder="Optional allowlist overrides blocklist"></textarea>
+      </div>
+      <button id="save">Save</button>
+      <span id="status" class="hint"></span>
+    </div>
+
+    <script>
+      const DEFAULTS = {
+        enableIframes: true,
+        iframeWindow: 3000,
+        blocklist: [
+          'doubleclick.net','googlesyndication.com','googleadservices.com',
+          'amazon-adsystem.com','2mdn.net','recaptcha.google.com','recaptcha.net',
+          'googletagmanager.com','indexww.com','adtrafficquality.google'
+        ],
+        allowlist: [],
+      };
+
+      function toLines(str){ return (str||'').split(/\r?\n/).map(s=>s.trim()).filter(Boolean); }
+      function fromLines(arr){ return (arr||[]).join('\n'); }
+
+      async function load() {
+        const store = await chrome.storage.sync.get(DEFAULTS);
+        document.getElementById('enableIframes').checked = !!store.enableIframes;
+        document.getElementById('iframeWindow').value = store.iframeWindow;
+        document.getElementById('blocklist').value = fromLines(store.blocklist);
+        document.getElementById('allowlist').value = fromLines(store.allowlist);
+      }
+
+      async function save() {
+        const enableIframes = document.getElementById('enableIframes').checked;
+        const iframeWindow = parseInt(document.getElementById('iframeWindow').value || '0', 10);
+        const blocklist = toLines(document.getElementById('blocklist').value);
+        const allowlist = toLines(document.getElementById('allowlist').value);
+        await chrome.storage.sync.set({ enableIframes, iframeWindow, blocklist, allowlist });
+        const el = document.getElementById('status');
+        el.textContent = 'Saved';
+        setTimeout(()=> el.textContent = '', 1500);
+      }
+
+      document.getElementById('save').addEventListener('click', save);
+      load();
+    </script>
+  </body>
+</html>
diff --git a/extension/src/entrypoints/sidepanel/components/recording-view.tsx b/extension/src/entrypoints/sidepanel/components/recording-view.tsx
index 0fa3456a..0aa09f33 100644
--- a/extension/src/entrypoints/sidepanel/components/recording-view.tsx
+++ b/extension/src/entrypoints/sidepanel/components/recording-view.tsx
@@ -1,12 +1,25 @@
 import React from "react";
 import { useWorkflow } from "../context/workflow-provider";
 import { Button } from "@/components/ui/button";
-import { EventViewer } from "./event-viewer"; // Import EventViewer
+import { EventViewer } from "./event-viewer";
 
 export const RecordingView: React.FC = () => {
   const { stopRecording, workflow } = useWorkflow();
   const stepCount = workflow?.steps?.length || 0;
 
+  const openOptions = () => chrome.runtime.openOptionsPage();
+  const [enableIframes, setEnableIframes] = React.useState(true);
+  React.useEffect(() => {
+    chrome.storage.sync.get({ enableIframes: true }, (s) =>
+      setEnableIframes(!!s.enableIframes)
+    );
+  }, []);
+  const toggleIframes = async () => {
+    const next = !enableIframes;
+    setEnableIframes(next);
+    await chrome.storage.sync.set({ enableIframes: next });
+  };
+
   return (
     <div className="flex flex-col h-full">
       <div className="flex items-center justify-between p-4 border-b border-border">
@@ -19,12 +32,29 @@ export const RecordingView: React.FC = () => {
             Recording ({stepCount} steps)
           </span>
         </div>
-        <Button variant="destructive" size="sm" onClick={stopRecording}>
-          Stop Recording
-        </Button>
+        <div className="flex items-center gap-2">
+          <Button
+            variant="outline"
+            size="sm"
+            onClick={toggleIframes}
+            title="Toggle iframe recording"
+          >
+            {enableIframes ? "Iframes: On" : "Iframes: Off"}
+          </Button>
+          <Button
+            variant="outline"
+            size="sm"
+            onClick={openOptions}
+            title="Open Options"
+          >
+            Options
+          </Button>
+          <Button variant="destructive" size="sm" onClick={stopRecording}>
+            Stop Recording
+          </Button>
+        </div>
       </div>
       <div className="flex-grow overflow-hidden p-4">
-        {/* EventViewer will now take full available space within this div */}
         <EventViewer />
       </div>
     </div>
diff --git a/extension/src/lib/types.ts b/extension/src/lib/types.ts
index 601d3aac..8cc94232 100644
--- a/extension/src/lib/types.ts
+++ b/extension/src/lib/types.ts
@@ -2,6 +2,7 @@ export interface StoredCustomClickEvent {
   timestamp: number;
   url: string;
   frameUrl: string;
+  frameIdPath?: string;
   xpath: string;
   cssSelector?: string;
   elementTag: string;
@@ -15,6 +16,7 @@ export interface StoredCustomInputEvent {
   timestamp: number;
   url: string;
   frameUrl: string;
+  frameIdPath?: string;
   xpath: string;
   cssSelector?: string;
   elementTag: string;
@@ -28,6 +30,7 @@ export interface StoredCustomSelectEvent {
   timestamp: number;
   url: string;
   frameUrl: string;
+  frameIdPath?: string;
   xpath: string;
   cssSelector?: string;
   elementTag: string;
@@ -42,6 +45,7 @@ export interface StoredCustomKeyEvent {
   timestamp: number;
   url: string;
   frameUrl: string;
+  frameIdPath?: string;
   key: string;
   xpath?: string; // XPath of focused element
   cssSelector?: string;
@@ -73,6 +77,7 @@ export interface StoredRrwebEvent {
   data: any;
   timestamp: number;
   tabId: number;
+  frameUrl?: string; // URL of the frame where the rrweb event originated (when injected per-frame)
   messageType: "RRWEB_EVENT";
 }
 
diff --git a/extension/src/lib/workflow-types.ts b/extension/src/lib/workflow-types.ts
index 57d4470f..35605bd6 100644
--- a/extension/src/lib/workflow-types.ts
+++ b/extension/src/lib/workflow-types.ts
@@ -27,12 +27,14 @@ export interface NavigationStep extends BaseStep {
   type: "navigation";
   url: string; // Navigation implies a URL change
   screenshot?: string; // Optional in source
+  frameIdPath?: string; // Optional frame context
 }
 
 export interface ClickStep extends BaseStep {
   type: "click";
   url: string;
   frameUrl: string;
+  frameIdPath?: string;
   xpath: string;
   cssSelector?: string; // Optional in source
   elementTag: string;
@@ -44,6 +46,7 @@ export interface InputStep extends BaseStep {
   type: "input";
   url: string;
   frameUrl: string;
+  frameIdPath?: string;
   xpath: string;
   cssSelector?: string; // Optional in source
   elementTag: string;
@@ -55,6 +58,7 @@ export interface KeyPressStep extends BaseStep {
   type: "key_press";
   url?: string; // Can be missing if key press happens without element focus? Source is optional.
   frameUrl?: string; // Might be missing
+  frameIdPath?: string;
   key: string;
   xpath?: string; // Optional in source
   cssSelector?: string; // Optional in source
@@ -67,6 +71,7 @@ export interface ScrollStep extends BaseStep {
   targetId: number; // The rrweb ID of the element being scrolled
   scrollX: number;
   scrollY: number;
+  frameIdPath?: string;
   // Note: url might be missing if scroll happens on initial load before meta event?
 }
 
diff --git a/extension/wxt.config.ts b/extension/wxt.config.ts
index 9d7ae4ec..83bf64d8 100644
--- a/extension/wxt.config.ts
+++ b/extension/wxt.config.ts
@@ -13,9 +13,15 @@ export default defineConfig({
       // WXT-specific overrides (optional)
     }),
   manifest: {
-    permissions: ["tabs", "sidePanel", "<all_urls>"],
-    host_permissions: ["http://127.0.0.1/*"],
-    // options_page: "options.html",
+    permissions: ["tabs", "sidePanel", "storage", "<all_urls>"],
+    // Broaden host permissions so content script can inject into iframes on external sites.
+    // Note: <all_urls> in permissions allows some access, but host_permissions explicitly grants injection rights.
+    host_permissions: [
+      "http://127.0.0.1/*",
+      "https://*/*",
+      "http://*/*"
+    ],
+    options_page: "options.html",
     // action: {
     //   default_popup: "popup.html",
     // },
diff --git a/workflows/workflow_use/recorder/service.py b/workflows/workflow_use/recorder/service.py
index fb154ba9..5de21556 100644
--- a/workflows/workflow_use/recorder/service.py
+++ b/workflows/workflow_use/recorder/service.py
@@ -85,7 +85,32 @@ async def _capture_and_signal_final_workflow(self, trigger_reason: str):
 		async with self.final_workflow_processed_lock:
 			if not self.final_workflow_processed_flag and self.last_workflow_update_event:
 				print(f'[Service] Capturing final workflow (Trigger: {trigger_reason}).')
-				self.final_workflow_output = self.last_workflow_update_event.payload
+				wf = self.last_workflow_update_event.payload
+				# Backend safety filter: drop about:blank and obvious ad/analytics iframe navigations
+				try:
+					clean_steps = []
+					for s in wf.steps:
+						st = getattr(s, 'type', None) or (s.get('type') if isinstance(s, dict) else None)
+						url = getattr(s, 'url', None) or (s.get('url') if isinstance(s, dict) else None)
+						if st == 'navigation':
+							if not url or url == 'about:blank':
+								continue
+							from urllib.parse import urlparse
+							host = urlparse(url).hostname or ''
+							blocked = any(
+								pat in host for pat in (
+									'doubleclick.net', 'googlesyndication.com', 'googleadservices.com',
+									'amazon-adsystem.com', '2mdn.net', 'recaptcha.google.com', 'recaptcha.net',
+									'googletagmanager.com', 'indexww.com', 'adtrafficquality.google'
+								)
+							)
+							if blocked:
+								continue
+						clean_steps.append(s)
+					wf.steps = clean_steps
+				except Exception as e:
+					print(f'[Service] Backend filter failed: {e}')
+				self.final_workflow_output = wf
 				self.final_workflow_processed_flag = True
 				processed_this_call = True
 

From cb846efe170a7c4da47a9d9051506b827de1442d Mon Sep 17 00:00:00 2001
From: Diveyam-Mishra <mishradiveyam@gmail.com>
Date: Mon, 11 Aug 2025 04:23:07 +0530
Subject: [PATCH 04/13] final recording steps completed frontend side

---
 extension/src/entrypoints/background.ts       | 23 ++++++++++-----
 .../sidepanel/components/recording-view.tsx   | 29 -------------------
 2 files changed, 15 insertions(+), 37 deletions(-)

diff --git a/extension/src/entrypoints/background.ts b/extension/src/entrypoints/background.ts
index 6e1ae033..bf8d49ea 100644
--- a/extension/src/entrypoints/background.ts
+++ b/extension/src/entrypoints/background.ts
@@ -136,13 +136,13 @@ export default defineBackground(() => {
               (last as any).elementTag === (step as any).elementTag &&
               (last as any).elementText === (step as any).elementText;
             break;
-          case 'scroll':
-            isDuplicate =
-              last.tabId === step.tabId &&
-              (last as any).targetId === (step as any).targetId &&
-              (last as any).scrollX === (step as any).scrollX &&
-              (last as any).scrollY === (step as any).scrollY;
+          case 'scroll': {
+            const sameXY = (last as any).scrollX === (step as any).scrollX && (last as any).scrollY === (step as any).scrollY;
+            const sameUrl = (last as any).url === (step as any).url;
+            const nearTime = Math.abs(step.timestamp - last.timestamp) < 200;
+            isDuplicate = last.tabId === step.tabId && sameXY && sameUrl && nearTime;
             break;
+          }
           case 'key_press':
             isDuplicate =
               last.tabId === step.tabId &&
@@ -485,14 +485,21 @@ export default defineBackground(() => {
               y: number;
             }; // Type assertion for clarity
             const currentTabInfo = tabInfo[rrEvent.tabId]; // Get associated tab info for URL
-
+            // Drop internal chrome pages like chrome://newtab/
+            if (currentTabInfo?.url?.startsWith('chrome://')) {
+              break;
+            }
             // Check if the last step added was a mergeable scroll event
             const lastStep = steps.length > 0 ? steps[steps.length - 1] : null;
             if (
               lastStep &&
               lastStep.type === "scroll" &&
               lastStep.tabId === rrEvent.tabId &&
-              (lastStep as ScrollStep).targetId === scrollData.id
+              // Treat same XY within a short time window as duplicate, regardless of targetId
+              (lastStep as ScrollStep).scrollX === scrollData.x &&
+              (lastStep as ScrollStep).scrollY === scrollData.y &&
+              Math.abs(rrEvent.timestamp - lastStep.timestamp) < 200 &&
+              (lastStep as any).url === currentTabInfo?.url
             ) {
               // Update the last scroll step
               (lastStep as ScrollStep).scrollX = scrollData.x;
diff --git a/extension/src/entrypoints/sidepanel/components/recording-view.tsx b/extension/src/entrypoints/sidepanel/components/recording-view.tsx
index 0aa09f33..1aa8a0de 100644
--- a/extension/src/entrypoints/sidepanel/components/recording-view.tsx
+++ b/extension/src/entrypoints/sidepanel/components/recording-view.tsx
@@ -7,19 +7,6 @@ export const RecordingView: React.FC = () => {
   const { stopRecording, workflow } = useWorkflow();
   const stepCount = workflow?.steps?.length || 0;
 
-  const openOptions = () => chrome.runtime.openOptionsPage();
-  const [enableIframes, setEnableIframes] = React.useState(true);
-  React.useEffect(() => {
-    chrome.storage.sync.get({ enableIframes: true }, (s) =>
-      setEnableIframes(!!s.enableIframes)
-    );
-  }, []);
-  const toggleIframes = async () => {
-    const next = !enableIframes;
-    setEnableIframes(next);
-    await chrome.storage.sync.set({ enableIframes: next });
-  };
-
   return (
     <div className="flex flex-col h-full">
       <div className="flex items-center justify-between p-4 border-b border-border">
@@ -33,22 +20,6 @@ export const RecordingView: React.FC = () => {
           </span>
         </div>
         <div className="flex items-center gap-2">
-          <Button
-            variant="outline"
-            size="sm"
-            onClick={toggleIframes}
-            title="Toggle iframe recording"
-          >
-            {enableIframes ? "Iframes: On" : "Iframes: Off"}
-          </Button>
-          <Button
-            variant="outline"
-            size="sm"
-            onClick={openOptions}
-            title="Open Options"
-          >
-            Options
-          </Button>
           <Button variant="destructive" size="sm" onClick={stopRecording}>
             Stop Recording
           </Button>

From 1856775978b152928dda3835ad049aba687cb43a Mon Sep 17 00:00:00 2001
From: Diveyam-Mishra <mishradiveyam@gmail.com>
Date: Mon, 11 Aug 2025 04:40:41 +0530
Subject: [PATCH 05/13] Added interaction with iframe from backend; now needs
 to contain the click in iframe itself

---
 workflows/workflow_use/controller/service.py | 43 +++++++++++++++++++-
 workflows/workflow_use/controller/views.py   |  2 +
 workflows/workflow_use/workflow/service.py   | 19 ++++++---
 3 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py
index 238fcbb5..f4c54d4d 100644
--- a/workflows/workflow_use/controller/service.py
+++ b/workflows/workflow_use/controller/service.py
@@ -20,7 +20,7 @@
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_ACTION_TIMEOUT_MS = 1000
+DEFAULT_ACTION_TIMEOUT_MS = 2500
 
 # List of default actions from browser_use.controller.service.Controller to disable
 # todo: come up with a better way to filter out the actions (filter IN the actions would be much nicer in this case)
@@ -83,9 +83,48 @@ async def click(params: ClickElementDeterministicAction, browser_session: Browse
 			page = await browser_session.get_current_page()
 			original_selector = params.cssSelector
 
+			# If frameUrl or frameIdPath are provided, narrow the search to that frame
+			def _select_context(pg):
+				try:
+					from playwright.async_api import Page, Frame
+					ctx: Page | Frame = pg
+					if getattr(params, 'frameIdPath', None):
+						parts = [p for p in str(params.frameIdPath).split('.') if p != '0' and p != '']
+						f = pg.main_frame
+						for seg in parts:
+							idx = int(seg)
+							if idx < len(f.child_frames):
+								f = f.child_frames[idx]
+							else:
+								return ctx
+						ctx = f
+					elif getattr(params, 'frameUrl', None):
+						from urllib.parse import urlparse
+						pf = urlparse(params.frameUrl)
+						for fr in pg.frames:
+							try:
+								ff = urlparse(fr.url)
+								# Match origin, and allow target frameUrl to be a prefix of full URL
+								if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(params.frameUrl):
+									ctx = fr
+									break
+							except Exception:
+								continue
+				except Exception:
+					ctx = pg
+				return ctx
+
 			try:
+				# If the step declares a URL and it's different from current, navigate first (minimum action to reach expected DOM)
+				curr = (page.url or '').split('#')[0]
+				tgt = (getattr(params, 'url', None) or getattr(params, 'frameUrl', None) or '').split('#')[0]
+				if tgt and tgt.startswith('http') and curr != tgt:
+					await page.goto(tgt)
+					await page.wait_for_load_state()
+
+				ctx = _select_context(page)
 				locator, selector_used = await get_best_element_handle(
-					page,
+					ctx,
 					params.cssSelector,
 					params,
 					timeout_ms=DEFAULT_ACTION_TIMEOUT_MS,
diff --git a/workflows/workflow_use/controller/views.py b/workflows/workflow_use/controller/views.py
index 8c61470a..ffb25b4a 100644
--- a/workflows/workflow_use/controller/views.py
+++ b/workflows/workflow_use/controller/views.py
@@ -23,6 +23,8 @@ class RecorderBase(StepMeta):
 	elementTag: Optional[str] = None
 	elementText: Optional[str] = None
 	frameUrl: Optional[str] = None
+	frameIdPath: Optional[str] = None
+	url: Optional[str] = None
 	screenshot: Optional[str] = None
 
 
diff --git a/workflows/workflow_use/workflow/service.py b/workflows/workflow_use/workflow/service.py
index aa7008c1..d21da2e8 100644
--- a/workflows/workflow_use/workflow/service.py
+++ b/workflows/workflow_use/workflow/service.py
@@ -144,11 +144,20 @@ def truncate_selector(selector: str) -> str:
 					await self.browser._wait_for_stable_network()
 					page = await self.browser.get_current_page()
 
-					logger.info(f'Waiting for element with selector: {truncate_selector(css_selector)}')
-					locator, selector_used = await get_best_element_handle(
-						page, css_selector, next_step_resolved, timeout_ms=WAIT_FOR_ELEMENT_TIMEOUT
-					)
-					logger.info(f'Element with selector found: {truncate_selector(selector_used)}')
+					# If the next step declares a URL/frameUrl and it does not match the current page URL,
+					# skip waiting for its element on the current page (avoids a false timeout failure when the element is expected on a different page or frame).
+					curr_url = (page.url or '').split('#')[0]
+					declared_next_url = (getattr(next_step_resolved, 'url', None) or getattr(next_step_resolved, 'frameUrl', None) or '').split('#')[0]
+					if declared_next_url and declared_next_url != curr_url:
+						logger.info(
+							f"Skipping pre-wait for next selector because declared next URL ({truncate_selector(declared_next_url)}) != current URL ({truncate_selector(curr_url)})"
+						)
+					else:
+						logger.info(f'Waiting for element with selector: {truncate_selector(css_selector)}')
+						locator, selector_used = await get_best_element_handle(
+							page, css_selector, next_step_resolved, timeout_ms=WAIT_FOR_ELEMENT_TIMEOUT
+						)
+						logger.info(f'Element with selector found: {truncate_selector(selector_used)}')
 				except Exception as e:
 					logger.error(f'Failed to wait for element with selector: {truncate_selector(css_selector)}. Error: {e}')
 					raise Exception(f'Failed to wait for element. Selector: {css_selector}') from e

From 2ccea941462751b0406981ee802c4bb0620ca526 Mon Sep 17 00:00:00 2001
From: Diveyam-Mishra <mishradiveyam@gmail.com>
Date: Thu, 14 Aug 2025 00:12:17 +0530
Subject: [PATCH 06/13] Backend reliably clicks elements inside iframes by
 iterating over frames

---
 workflows/workflow_use/controller/service.py | 96 +++++++++++++++-----
 1 file changed, 71 insertions(+), 25 deletions(-)

diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py
index f4c54d4d..2465d8cc 100644
--- a/workflows/workflow_use/controller/service.py
+++ b/workflows/workflow_use/controller/service.py
@@ -88,50 +88,96 @@ def _select_context(pg):
 				try:
 					from playwright.async_api import Page, Frame
 					ctx: Page | Frame = pg
-					if getattr(params, 'frameIdPath', None):
-						parts = [p for p in str(params.frameIdPath).split('.') if p != '0' and p != '']
+					# If frame hints point to top document, stay on page
+					fid = getattr(params, 'frameIdPath', None)
+					furl = getattr(params, 'frameUrl', None)
+					curr_url = (pg.url or '').split('#')[0] if hasattr(pg, 'url') else ''
+					if furl and curr_url and furl.split('#')[0] == curr_url:
+						return pg
+					if fid:
+						segs = [s for s in str(fid).split('.') if s != '']
+						if all(s == '0' for s in segs):
+							return pg
 						f = pg.main_frame
-						for seg in parts:
-							idx = int(seg)
-							if idx < len(f.child_frames):
+						for s in segs[1:]:  # skip top marker
+							idx = int(s)
+							if 0 <= idx < len(f.child_frames):
 								f = f.child_frames[idx]
 							else:
-								return ctx
-						ctx = f
-					elif getattr(params, 'frameUrl', None):
+								return pg
+						return f
+					if furl:
 						from urllib.parse import urlparse
-						pf = urlparse(params.frameUrl)
+						pf = urlparse(furl)
+						# If frameUrl equals current page URL (origin+path), stay on page
+						try:
+							from urllib.parse import urlparse as _u
+							cu = _u(curr_url)
+							if (cu.scheme, cu.netloc, cu.path) == (pf.scheme, pf.netloc, pf.path):
+								return pg
+						except Exception:
+							pass
 						for fr in pg.frames:
 							try:
 								ff = urlparse(fr.url)
-								# Match origin, and allow target frameUrl to be a prefix of full URL
-								if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(params.frameUrl):
-									ctx = fr
-									break
+								if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(furl):
+									return fr
 							except Exception:
 								continue
 				except Exception:
-					ctx = pg
+					return pg
 				return ctx
 
+			# Fallback: search all frames for selector (prefer frames matching target origin)
+			async def _find_in_frames(pg, selector: str):
+				from urllib.parse import urlparse
+				prefer = getattr(params, 'frameUrl', None) or getattr(params, 'url', None) or ''
+				pref_o = urlparse(prefer) if prefer else None
+				frames = list(pg.frames)
+				def score(fr):
+					if not pref_o:
+						return 0
+					try:
+						fo = urlparse(fr.url)
+						return 2 if (fo.scheme, fo.netloc) == (pref_o.scheme, pref_o.netloc) else 0
+					except Exception:
+						return 0
+				frames.sort(key=score, reverse=True)
+				for fr in frames:
+					try:
+						loc, used = await get_best_element_handle(fr, selector, params, timeout_ms=max(800, DEFAULT_ACTION_TIMEOUT_MS // 2))
+						return fr, loc, used
+					except Exception:
+						continue
+				return None, None, None
+
 			try:
-				# If the step declares a URL and it's different from current, navigate first (minimum action to reach expected DOM)
+				# Only auto-navigate for top-document clicks (no frame hints) when a different URL is declared
 				curr = (page.url or '').split('#')[0]
-				tgt = (getattr(params, 'url', None) or getattr(params, 'frameUrl', None) or '').split('#')[0]
-				if tgt and tgt.startswith('http') and curr != tgt:
-					await page.goto(tgt)
+				declared_url = (getattr(params, 'url', None) or '').split('#')[0]
+				has_frame_hints = bool(getattr(params, 'frameIdPath', None) or getattr(params, 'frameUrl', None))
+				if declared_url and declared_url.startswith('http') and not has_frame_hints and curr != declared_url:
+					await page.goto(declared_url)
 					await page.wait_for_load_state()
 
 				ctx = _select_context(page)
-				locator, selector_used = await get_best_element_handle(
-					ctx,
-					params.cssSelector,
-					params,
-					timeout_ms=DEFAULT_ACTION_TIMEOUT_MS,
-				)
+				try:
+					locator, selector_used = await get_best_element_handle(
+						ctx,
+						params.cssSelector,
+						params,
+						timeout_ms=DEFAULT_ACTION_TIMEOUT_MS,
+					)
+				except Exception:
+					# Fallback: search all frames
+					fr, locator, selector_used = await _find_in_frames(page, params.cssSelector)
+					if locator is None:
+						raise
+
 				await locator.click(force=True)
 
-				msg = f'🖱️  Clicked element with CSS selector: {truncate_selector(selector_used)} (original: {truncate_selector(original_selector)})'
+				used_str = selector_used if isinstance(selector_used, str) and selector_used else params.cssSelector
+				msg = f'🖱️  Clicked element with CSS selector: {truncate_selector(used_str)} (original: {truncate_selector(original_selector)})'
 				logger.info(msg)
 				return ActionResult(extracted_content=msg, include_in_memory=True)
 			except Exception as e:

From 3ec843fccee7212829a20369a18612b803ab6610 Mon Sep 17 00:00:00 2001
From: Diveyam-Mishra <mishradiveyam@gmail.com>
Date: Thu, 14 Aug 2025 00:45:17 +0530
Subject: [PATCH 07/13] Add iframe test workflow and Copilot instructions

---
 .github/copilot-instructions.md      |  62 ++++++++++++++++
 workflows/examples/test_iframes.json | 101 +++++++++++++++++++++++++++
 2 files changed, 163 insertions(+)
 create mode 100644 .github/copilot-instructions.md
 create mode 100644 workflows/examples/test_iframes.json

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
new file mode 100644
index 00000000..9cd442d9
--- /dev/null
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,62 @@
+# Copilot Instructions for workflow-use
+
+These project-specific instructions help AI coding agents work productively in this repo. Keep responses concise, implement with tools when possible, and follow the repo’s patterns.
+
+## Architecture (big picture)
+- Monorepo with three main parts:
+  - `extension/` (WXT MV3 Chrome extension): records user actions via rrweb + custom DOM events and streams to a local server.
+  - `workflows/` (Python package + CLI + FastAPI backend): converts recorded events into deterministic workflow steps, runs/replays steps (fallback to Browser Use/Playwright).
+  - `ui/` (Vite/React frontend): GUI to view and run workflows.
+- Data flow:
+  1) Content scripts capture events → background aggregates → posts `WORKFLOW_UPDATE` to `http://127.0.0.1:7331/event`.
+  2) Backend stores/processes to `workflows/examples/*.json` and executes with Playwright.
+  3) UI/CLI visualize/run.
+- Key types: `extension/src/lib/types.ts` (Stored* events), `extension/src/lib/workflow-types.ts` (Step union), `workflows` Python `Workflow` model.
+
+## Dev workflows
+- Build extension: `cd extension && npm install && npm run build`.
+- Python backend setup: `cd workflows && uv sync && playwright install chromium && cp .env.example .env`.
+- Record: `cd workflows && python cli.py create-workflow` (starts local server; open Chrome with built extension).
+- Run workflow as tool: `python cli.py run-as-tool examples/example.workflow.json --prompt "..."`.
+- Run workflow: `python cli.py run-workflow examples/example.workflow.json`.
+- Launch GUI: `python cli.py launch-gui` (starts FastAPI + UI dev server).
+
+## Extension patterns
+- Use `defineBackground` and `defineContentScript` (WXT). Content script always attaches listeners; background aggregates and emits `WORKFLOW_UPDATE` with a hash to avoid spam.
+- Recording:
+  - rrweb for scroll/meta; custom `CUSTOM_CLICK_EVENT`, `CUSTOM_INPUT_EVENT`, `CUSTOM_KEY_EVENT`, etc.
+  - New-tab intent: content sends `PREPARE_NEW_TAB`; background correlates `tabs.onCreated` and marks `userInitiated`.
+  - Activated tab gating: ignore tabs never activated (reduces ad/tracker noise).
+  - Dedupe: merge consecutive identical steps, collapse rapid empty input bursts, consolidate navigations per tab.
+  - Iframes: content runs with `allFrames: true` and `matchAboutBlank: true`; events carry `frameUrl` and `frameIdPath`. Background only allows rrweb meta navigations from frames the user interacted with and filters ad/analytics hosts.
+
+## Backend patterns
+- Python FastAPI endpoint `http://127.0.0.1:7331/event` receives:
+  - `RECORDING_STARTED/STOPPED`, `WORKFLOW_UPDATE` with `steps` only (hash-based dedupe).
+- CLI: `workflows/cli.py` provides record/run/launch commands; Playwright is used for replay.
+- Keep workflow JSON in `workflows/examples/`. Naming is free-form; version stays at `1.0.0` today.
+
+## Conventions
+- Step schema (extension `workflow-types.ts`): navigation, click, input, key_press, scroll. Prefer merging updates over emitting new steps.
+- Use XPath + enhanced CSS selectors; keep values masked for password inputs.
+- Avoid sending events from tabs not in `activatedTabs` unless `userInitiated`.
+- When adding new event types, extend Stored* in `types.ts`, enrich in content, and map to `Step` in background.
+
+## Gotchas / Tips
+- Avoid noisy iframe navs (recaptcha/ads): rely on `interactedFrameUrls` filtering in background. If adjusting, prefer allow/deny logic over hard-coding hosts in multiple places.
+- When changing extension logic, rebuild with `npm run build`; dev opens side panel on install/update.
+- Screenshot capture only works for visible tabs; background uses `captureVisibleTab` best-effort.
+- If tests are added, ensure they run per package (`extension`, `workflows`, `ui`) rather than at repo root.
+
+## Example tasks for agents
+- Add a new step type (e.g., select):
+  1) Add a `StoredCustomSelectEvent` type to `types.ts` and emit it from `content.ts`.
+  2) Map to a `SelectStep` in `background.ts` (convertStoredEventsToSteps).
+  3) Update backend replay to handle the new step.
+- Reduce noise further:
+  - Tune debounce windows in content.
+  - Post-process duplicates in `broadcastWorkflowDataUpdate`.
+  - Add frame interaction checks before accepting rrweb meta navigations.
+
+## Security & secrets
+- Do not commit real API keys. `.env.example` exists; load secrets locally. If you see a real key in `workflows/.env`, instruct maintainers to rotate and remove it.
diff --git a/workflows/examples/test_iframes.json b/workflows/examples/test_iframes.json
new file mode 100644
index 00000000..b1834355
--- /dev/null
+++ b/workflows/examples/test_iframes.json
@@ -0,0 +1,101 @@
+{
+  "name": "Recorded Workflow",
+  "description": "Recorded on 8/11/2025, 4:20:47 AM",
+  "version": "1.0.0",
+  "input_schema": [],
+  "steps": [
+    {
+      "type": "navigation",
+      "timestamp": 1754866228439,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    },
+    {
+      "type": "scroll",
+      "timestamp": 1754866228608,
+      "tabId": 388342781,
+      "targetId": 219,
+      "scrollX": 0,
+      "scrollY": 7,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    },
+    {
+      "type": "navigation",
+      "timestamp": 1754866228634,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    },
+    {
+      "type": "click",
+      "timestamp": 1754866228849,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe",
+      "frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe",
+      "frameIdPath": "0",
+      "xpath": "id(\"textareawrapper\")/div[1]/div[6]",
+      "cssSelector": "div.CodeMirror-scroll",
+      "elementTag": "DIV",
+      "elementText": "<!DOCTYPE html><html><body>​<h1>The iframe element</h1>​<iframe src=\"https://www.w3schools.com\" title=\"W3Schools Free Online Web Tutorials\"></iframe>​</body></html>​"
+    },
+    {
+      "type": "navigation",
+      "timestamp": 1754866230495,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    },
+    {
+      "type": "click",
+      "timestamp": 1754866231531,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe",
+      "frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe",
+      "frameIdPath": "0",
+      "xpath": "body/div[2]/div[1]/a[4]",
+      "cssSelector": "a.w3-button.w3-bar-item.w3-hide-small.topnav-icons.fa.fa-rotate.ga-tryit[href=\"javascript:void(0);\"][title*=\"Change Orientation\"]",
+      "elementTag": "A",
+      "elementText": ""
+    },
+    {
+      "type": "navigation",
+      "timestamp": 1754866237707,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    },
+    {
+      "type": "click",
+      "timestamp": 1754866238574,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/",
+      "frameUrl": "https://www.w3schools.com/",
+      "frameIdPath": "0.0",
+      "xpath": "id(\"subtopnav\")/a[3]",
+      "cssSelector": "a.ga-nav[href=\"/js/default.asp\"][title*=\"JavaScript Tutorial\"]",
+      "elementTag": "A",
+      "elementText": "JAVASCRIPT"
+    },
+    {
+      "type": "navigation",
+      "timestamp": 1754866242778,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    },
+    {
+      "type": "click",
+      "timestamp": 1754866242882,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/js/default.asp",
+      "frameUrl": "https://www.w3schools.com/js/default.asp",
+      "frameIdPath": "0.0",
+      "xpath": "id(\"subtopnav\")/a[1]",
+      "cssSelector": "a.ga-nav.subtopnav_firstitem[href=\"/html/default.asp\"][title*=\"HTML Tutorial\"]",
+      "elementTag": "A",
+      "elementText": "HTML"
+    },
+    {
+      "type": "navigation",
+      "timestamp": 1754866246395,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    }
+  ]
+}
\ No newline at end of file

From 0e46a9f234bdaea02edd2ada1584f7a4f2b412be Mon Sep 17 00:00:00 2001
From: Diveyam Mishra <mishradiveyam@gmail.com>
Date: Sat, 25 Oct 2025 23:07:28 +0530
Subject: [PATCH 08/13] feat(workflow): strengthen iframe-aware execution and
 extraction

---
 workflows/examples/test_iframes.json         | 101 ++++
 workflows/workflow_use/controller/service.py | 131 ++++-
 workflows/workflow_use/controller/views.py   |   7 +-
 workflows/workflow_use/recorder/service.py   |  37 +-
 workflows/workflow_use/workflow/service.py   | 538 ++++++++-----------
 5 files changed, 457 insertions(+), 357 deletions(-)
 create mode 100644 workflows/examples/test_iframes.json

diff --git a/workflows/examples/test_iframes.json b/workflows/examples/test_iframes.json
new file mode 100644
index 00000000..b1834355
--- /dev/null
+++ b/workflows/examples/test_iframes.json
@@ -0,0 +1,101 @@
+{
+  "name": "Recorded Workflow",
+  "description": "Recorded on 8/11/2025, 4:20:47 AM",
+  "version": "1.0.0",
+  "input_schema": [],
+  "steps": [
+    {
+      "type": "navigation",
+      "timestamp": 1754866228439,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    },
+    {
+      "type": "scroll",
+      "timestamp": 1754866228608,
+      "tabId": 388342781,
+      "targetId": 219,
+      "scrollX": 0,
+      "scrollY": 7,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    },
+    {
+      "type": "navigation",
+      "timestamp": 1754866228634,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    },
+    {
+      "type": "click",
+      "timestamp": 1754866228849,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe",
+      "frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe",
+      "frameIdPath": "0",
+      "xpath": "id(\"textareawrapper\")/div[1]/div[6]",
+      "cssSelector": "div.CodeMirror-scroll",
+      "elementTag": "DIV",
+      "elementText": "<!DOCTYPE html><html><body>​<h1>The iframe element</h1>​<iframe src=\"https://www.w3schools.com\" title=\"W3Schools Free Online Web Tutorials\"></iframe>​</body></html>​"
+    },
+    {
+      "type": "navigation",
+      "timestamp": 1754866230495,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    },
+    {
+      "type": "click",
+      "timestamp": 1754866231531,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe",
+      "frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe",
+      "frameIdPath": "0",
+      "xpath": "body/div[2]/div[1]/a[4]",
+      "cssSelector": "a.w3-button.w3-bar-item.w3-hide-small.topnav-icons.fa.fa-rotate.ga-tryit[href=\"javascript:void(0);\"][title*=\"Change Orientation\"]",
+      "elementTag": "A",
+      "elementText": ""
+    },
+    {
+      "type": "navigation",
+      "timestamp": 1754866237707,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    },
+    {
+      "type": "click",
+      "timestamp": 1754866238574,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/",
+      "frameUrl": "https://www.w3schools.com/",
+      "frameIdPath": "0.0",
+      "xpath": "id(\"subtopnav\")/a[3]",
+      "cssSelector": "a.ga-nav[href=\"/js/default.asp\"][title*=\"JavaScript Tutorial\"]",
+      "elementTag": "A",
+      "elementText": "JAVASCRIPT"
+    },
+    {
+      "type": "navigation",
+      "timestamp": 1754866242778,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    },
+    {
+      "type": "click",
+      "timestamp": 1754866242882,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/js/default.asp",
+      "frameUrl": "https://www.w3schools.com/js/default.asp",
+      "frameIdPath": "0.0",
+      "xpath": "id(\"subtopnav\")/a[1]",
+      "cssSelector": "a.ga-nav.subtopnav_firstitem[href=\"/html/default.asp\"][title*=\"HTML Tutorial\"]",
+      "elementTag": "A",
+      "elementText": "HTML"
+    },
+    {
+      "type": "navigation",
+      "timestamp": 1754866246395,
+      "tabId": 388342781,
+      "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py
index 4205a952..2465d8cc 100644
--- a/workflows/workflow_use/controller/service.py
+++ b/workflows/workflow_use/controller/service.py
@@ -3,8 +3,9 @@
 
 from browser_use import Browser
 from browser_use.agent.views import ActionResult
-from browser_use.controller import Controller
-from browser_use.llm.base import BaseChatModel
+from browser_use.controller.service import Controller
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.prompts import PromptTemplate
 
 from workflow_use.controller.utils import get_best_element_handle, truncate_selector
 from workflow_use.controller.views import (
@@ -19,7 +20,7 @@
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_ACTION_TIMEOUT_MS = 1000
+DEFAULT_ACTION_TIMEOUT_MS = 2500
 
 # List of default actions from browser_use.controller.service.Controller to disable
 # todo: come up with a better way to filter out the actions (filter IN the actions would be much nicer in this case)
@@ -65,9 +66,7 @@ async def navigation(params: NavigationAction, browser_session: Browser) -> Acti
 			"""Navigate to the given URL."""
 			page = await browser_session.get_current_page()
 			await page.goto(params.url)
-			# Wait for page to load (CDP navigate doesn't wait automatically)
-			import asyncio
-			await asyncio.sleep(2)
+			await page.wait_for_load_state()
 
 			msg = f'🔗  Navigated to URL: {params.url}'
 			logger.info(msg)
@@ -84,16 +83,101 @@ async def click(params: ClickElementDeterministicAction, browser_session: Browse
 			page = await browser_session.get_current_page()
 			original_selector = params.cssSelector
 
+			# If frameUrl or frameIdPath are provided, narrow the search to that frame
+			def _select_context(pg):
+				try:
+					from playwright.async_api import Page, Frame
+					ctx: Page | Frame = pg
+					# If frame hints point to top document, stay on page
+					fid = getattr(params, 'frameIdPath', None)
+					furl = getattr(params, 'frameUrl', None)
+					curr_url = (pg.url or '').split('#')[0] if hasattr(pg, 'url') else ''
+					if furl and curr_url and furl.split('#')[0] == curr_url:
+						return pg
+					if fid:
+						segs = [s for s in str(fid).split('.') if s != '']
+						if all(s == '0' for s in segs):
+							return pg
+						f = pg.main_frame
+						for s in segs[1:]:  # skip top marker
+							idx = int(s)
+							if 0 <= idx < len(f.child_frames):
+								f = f.child_frames[idx]
+							else:
+								return pg
+						return f
+					if furl:
+						from urllib.parse import urlparse
+						pf = urlparse(furl)
+						# If frameUrl equals current page URL (origin+path), stay on page
+						try:
+							from urllib.parse import urlparse as _u
+							cu = _u(curr_url)
+							if (cu.scheme, cu.netloc, cu.path) == (pf.scheme, pf.netloc, pf.path):
+								return pg
+						except Exception:
+							pass
+						for fr in pg.frames:
+							try:
+								ff = urlparse(fr.url)
+								if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(furl):
+									return fr
+							except Exception:
+								continue
+				except Exception:
+					return pg
+				return ctx
+
+			# Fallback: search all frames for selector (prefer frames matching target origin)
+			async def _find_in_frames(pg, selector: str):
+				from urllib.parse import urlparse
+				prefer = getattr(params, 'frameUrl', None) or getattr(params, 'url', None) or ''
+				pref_o = urlparse(prefer) if prefer else None
+				frames = list(pg.frames)
+				def score(fr):
+					if not pref_o:
+						return 0
+					try:
+						fo = urlparse(fr.url)
+						return 2 if (fo.scheme, fo.netloc) == (pref_o.scheme, pref_o.netloc) else 0
+					except Exception:
+						return 0
+				frames.sort(key=score, reverse=True)
+				for fr in frames:
+					try:
+						loc, used = await get_best_element_handle(fr, selector, params, timeout_ms=max(800, DEFAULT_ACTION_TIMEOUT_MS // 2))
+						return fr, loc, used
+					except Exception:
+						continue
+				return None, None, None
+
 			try:
-				locator, selector_used = await get_best_element_handle(
-					page,
-					params.cssSelector,
-					params,
-					timeout_ms=DEFAULT_ACTION_TIMEOUT_MS,
-				)
+				# Only auto-navigate for top-document clicks (no frame hints) when a different URL is declared
+				curr = (page.url or '').split('#')[0]
+				declared_url = (getattr(params, 'url', None) or '').split('#')[0]
+				has_frame_hints = bool(getattr(params, 'frameIdPath', None) or getattr(params, 'frameUrl', None))
+				if declared_url and declared_url.startswith('http') and not has_frame_hints and curr != declared_url:
+					await page.goto(declared_url)
+					await page.wait_for_load_state()
+
+				ctx = _select_context(page)
+				try:
+					locator, selector_used = await get_best_element_handle(
+						ctx,
+						params.cssSelector,
+						params,
+						timeout_ms=DEFAULT_ACTION_TIMEOUT_MS,
+					)
+				except Exception:
+					# Fallback: search all frames
+					fr, locator, selector_used = await _find_in_frames(page, params.cssSelector)
+					if locator is None:
+						raise
+
 				await locator.click(force=True)
 
-				msg = f'🖱️  Clicked element with CSS selector: {truncate_selector(selector_used)} (original: {truncate_selector(original_selector)})'
+				used_str = selector_used if isinstance(selector_used, str) and selector_used else params.cssSelector
+				msg = f'🖱️  Clicked element with CSS selector: {truncate_selector(used_str)} (original: {truncate_selector(original_selector)})'
 				logger.info(msg)
 				return ActionResult(extracted_content=msg, include_in_memory=True)
 			except Exception as e:
@@ -201,7 +285,7 @@ async def key_press(params: KeyPressDeterministicAction, browser_session: Browse
 		async def scroll(params: ScrollDeterministicAction, browser_session: Browser) -> ActionResult:
 			"""Scroll the page by the given x/y pixel offsets."""
 			page = await browser_session.get_current_page()
-			await page.evaluate(f'() => window.scrollBy({params.scrollX}, {params.scrollY})')
+			await page.evaluate(f'window.scrollBy({params.scrollX}, {params.scrollY});')
 			msg = f'📜  Scrolled page by (x={params.scrollX}, y={params.scrollY})'
 			logger.info(msg)
 			return ActionResult(extracted_content=msg, include_in_memory=True)
@@ -220,18 +304,19 @@ async def extract_page_content(
 
 			strip = ['a', 'img']
 
-			# Get page HTML content using CDP evaluate
-			html_content = await page.evaluate('() => document.documentElement.outerHTML')
-			content = markdownify.markdownify(html_content, strip=strip)
+			content = markdownify.markdownify(await page.content(), strip=strip)
 
-			# Note: iframe content extraction is not yet supported in CDP-based implementation
-			# TODO: Implement iframe content extraction using CDP
+			# manually append iframe text into the content so it's readable by the LLM (includes cross-origin iframes)
+			for iframe in page.frames:
+				if iframe.url != page.url and not iframe.url.startswith('data:'):
+					content += f'\n\nIFRAME {iframe.url}:\n'
+					content += markdownify.markdownify(await iframe.content())
 
-			prompt = f'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {params.goal}, Page: {content}'
+			prompt = 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {goal}, Page: {page}'
+			template = PromptTemplate(input_variables=['goal', 'page'], template=prompt)
 			try:
-				from browser_use.llm import UserMessage
-				output = await page_extraction_llm.ainvoke([UserMessage(content=prompt)])
-				msg = f'📄  Extracted from page\n: {output.completion}\n'
+				output = await page_extraction_llm.ainvoke(template.format(goal=params.goal, page=content))
+				msg = f'📄  Extracted from page\n: {output.content}\n'
 				logger.info(msg)
 				return ActionResult(extracted_content=msg, include_in_memory=True)
 			except Exception as e:
diff --git a/workflows/workflow_use/controller/views.py b/workflows/workflow_use/controller/views.py
index 1df0900c..ffb25b4a 100644
--- a/workflows/workflow_use/controller/views.py
+++ b/workflows/workflow_use/controller/views.py
@@ -13,9 +13,8 @@ class Config:
 
 # Mixin for shared step metadata (timestamp and tab context)
 class StepMeta(_BaseExtra):
-	# timestamp: int
-	# tabId: int
-	pass
+	timestamp: int
+	tabId: int
 
 
 # Common optional fields present in recorder events
@@ -24,6 +23,8 @@ class RecorderBase(StepMeta):
 	elementTag: Optional[str] = None
 	elementText: Optional[str] = None
 	frameUrl: Optional[str] = None
+	frameIdPath: Optional[str] = None
+	url: Optional[str] = None
 	screenshot: Optional[str] = None
 
 
diff --git a/workflows/workflow_use/recorder/service.py b/workflows/workflow_use/recorder/service.py
index 756ca6bb..5de21556 100644
--- a/workflows/workflow_use/recorder/service.py
+++ b/workflows/workflow_use/recorder/service.py
@@ -7,6 +7,7 @@
 from browser_use import Browser
 from browser_use.browser.profile import BrowserProfile
 from fastapi import FastAPI
+from patchright.async_api import async_playwright as patchright_async_playwright
 
 # Assuming views.py is correctly located for this import path
 from workflow_use.recorder.views import (
@@ -84,7 +85,32 @@ async def _capture_and_signal_final_workflow(self, trigger_reason: str):
 		async with self.final_workflow_processed_lock:
 			if not self.final_workflow_processed_flag and self.last_workflow_update_event:
 				print(f'[Service] Capturing final workflow (Trigger: {trigger_reason}).')
-				self.final_workflow_output = self.last_workflow_update_event.payload
+				wf = self.last_workflow_update_event.payload
+				# Backend safety filter: drop about:blank and obvious ad/analytics iframe navigations
+				try:
+					clean_steps = []
+					for s in wf.steps:
+						st = getattr(s, 'type', None) or (s.get('type') if isinstance(s, dict) else None)
+						url = getattr(s, 'url', None) or (s.get('url') if isinstance(s, dict) else None)
+						if st == 'navigation':
+							if not url or url == 'about:blank':
+								continue
+							from urllib.parse import urlparse
+							host = urlparse(url).hostname or ''
+							blocked = any(
+								pat in host for pat in (
+									'doubleclick.net', 'googlesyndication.com', 'googleadservices.com',
+									'amazon-adsystem.com', '2mdn.net', 'recaptcha.google.com', 'recaptcha.net',
+									'googletagmanager.com', 'indexww.com', 'adtrafficquality.google'
+								)
+							)
+							if blocked:
+								continue
+						clean_steps.append(s)
+					wf.steps = clean_steps
+				except Exception as e:
+					print(f'[Service] Backend filter failed: {e}')
+				self.final_workflow_output = wf
 				self.final_workflow_processed_flag = True
 				processed_this_call = True
 
@@ -96,7 +122,7 @@ async def _capture_and_signal_final_workflow(self, trigger_reason: str):
 			if trigger_reason == 'RecordingStoppedEvent' and self.browser:
 				print('[Service] Attempting to close browser due to RecordingStoppedEvent...')
 				try:
-					await self.browser.stop()
+					await self.browser.close()
 					print('[Service] Browser close command issued.')
 				except Exception as e_close:
 					print(f'[Service] Error closing browser on recording stop: {e_close}')
@@ -127,7 +153,8 @@ async def _launch_browser_and_wait(self):
 			)
 
 			# Create and configure browser
-			self.browser = Browser(browser_profile=profile)
+			playwright = await patchright_async_playwright().start()
+			self.browser = Browser(browser_profile=profile, playwright=playwright)
 
 			print('[Service] Starting browser with extensions...')
 			await self.browser.start()
@@ -150,7 +177,7 @@ async def _launch_browser_and_wait(self):
 			print('[Service] Browser task cancelled.')
 			if self.browser:
 				try:
-					await self.browser.stop()
+					await self.browser.close()
 				except:
 					pass  # Best effort
 			raise  # Re-raise to be caught by gather
@@ -218,7 +245,7 @@ async def capture_workflow(self) -> Optional[WorkflowDefinitionSchema]:
 				print('[Service] Ensuring browser is closed in cleanup...')
 				try:
 					self.browser.browser_profile.keep_alive = False
-					await self.browser.stop()
+					await self.browser.close()
 				except Exception as e_browser_close:
 					print(f'[Service] Error closing browser in final cleanup: {e_browser_close}')
 				# self.browser = None
diff --git a/workflows/workflow_use/workflow/service.py b/workflows/workflow_use/workflow/service.py
index e8bedad7..d21da2e8 100644
--- a/workflows/workflow_use/workflow/service.py
+++ b/workflows/workflow_use/workflow/service.py
@@ -6,25 +6,32 @@
 import logging
 from pathlib import Path
 from typing import Any, Dict, List, TypeVar
-from typing import cast as _cast
 
 from browser_use import Agent, Browser
 from browser_use.agent.views import ActionResult, AgentHistoryList
-from browser_use.llm.base import BaseChatModel
-from browser_use.llm import SystemMessage, UserMessage
-from pydantic import BaseModel, Field, create_model
+from langchain.agents import AgentExecutor, create_tool_calling_agent
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.tools import StructuredTool
+from pydantic import BaseModel, create_model
 
 from workflow_use.controller.service import WorkflowController
 from workflow_use.controller.utils import get_best_element_handle
 from workflow_use.schema.views import (
 	AgenticWorkflowStep,
+	ClickStep,
 	DeterministicWorkflowStep,
+	InputStep,
+	KeyPressStep,
+	NavigationStep,
+	ScrollStep,
+	SelectChangeStep,
 	WorkflowDefinitionSchema,
 	WorkflowInputSchemaDefinition,
 	WorkflowStep,
 )
-from workflow_use.workflow.prompts import AGENT_STEP_SYSTEM_PROMPT, STRUCTURED_OUTPUT_PROMPT
-from workflow_use.workflow.step_agent.controller import WorkflowStepAgentController
+from workflow_use.workflow.prompts import STRUCTURED_OUTPUT_PROMPT, WORKFLOW_FALLBACK_PROMPT_TEMPLATE
 from workflow_use.workflow.views import WorkflowRunOutput
 
 logger = logging.getLogger(__name__)
@@ -40,13 +47,12 @@ class Workflow:
 	def __init__(
 		self,
 		workflow_schema: WorkflowDefinitionSchema,
-		llm: BaseChatModel,
 		*,
 		controller: WorkflowController | None = None,
 		browser: Browser | None = None,
+		llm: BaseChatModel | None = None,
 		page_extraction_llm: BaseChatModel | None = None,
 		fallback_to_agent: bool = True,
-		use_cloud: bool = False,
 	) -> None:
 		"""Initialize a new Workflow instance from a schema object.
 
@@ -56,16 +62,20 @@ def __init__(
 			browser: Optional Browser instance to use for browser automation
 			llm: Optional language model for fallback agent functionality
 			fallback_to_agent: Whether to fall back to agent-based execution on step failure
-			use_cloud: Whether to use browser-use cloud browser service instead of local browser
 
 		Raises:
 			ValueError: If the workflow schema is invalid (though Pydantic handles most).
 		"""
 		self.schema = workflow_schema  # Store the schema object
 
+		self.name = self.schema.name
+		self.description = self.schema.description
+		self.version = self.schema.version
+		self.steps = self.schema.steps
+
 		self.controller = controller or WorkflowController()
 
-		self.browser = browser or Browser(use_cloud=use_cloud)
+		self.browser = browser or Browser()
 
 		# Hack to not close it after agent kicks in
 		self.browser.browser_profile.keep_alive = True
@@ -85,12 +95,11 @@ def __init__(
 	def load_from_file(
 		cls,
 		file_path: str | Path,
-		llm: BaseChatModel,
 		*,
 		controller: WorkflowController | None = None,
 		browser: Browser | None = None,
+		llm: BaseChatModel | None = None,
 		page_extraction_llm: BaseChatModel | None = None,
-		use_cloud: bool = False,
 	) -> Workflow:
 		"""Load a workflow from a file."""
 		with open(file_path, 'r', encoding='utf-8') as f:
@@ -102,7 +111,6 @@ def load_from_file(
 			browser=browser,
 			llm=llm,
 			page_extraction_llm=page_extraction_llm,
-			use_cloud=use_cloud,
 		)
 
 	# --- Runners ---
@@ -127,167 +135,125 @@ def truncate_selector(selector: str) -> str:
 
 		# Determine if this is not the last step, and extract next step's cssSelector if available
 		current_index = step_index
-		if current_index < len(self.schema.steps) - 1:
-			next_step = self.schema.steps[current_index + 1]
+		if current_index < len(self.steps) - 1:
+			next_step = self.steps[current_index + 1]
 			next_step_resolved = self._resolve_placeholders(next_step)
 			css_selector = getattr(next_step_resolved, 'cssSelector', None)
 			if css_selector:
 				try:
+					await self.browser._wait_for_stable_network()
 					page = await self.browser.get_current_page()
 
-					logger.info(f'Waiting for element with selector: {truncate_selector(css_selector)}')
-					locator, selector_used = await get_best_element_handle(
-						page, css_selector, next_step_resolved, timeout_ms=WAIT_FOR_ELEMENT_TIMEOUT
-					)
-					logger.info(f'Element with selector found: {truncate_selector(selector_used)}')
+					# If the next step declares a URL/frameUrl and it does not match the current page URL,
+					# skip waiting for its element on the current page (prevents false failures like step 7).
+					curr_url = (page.url or '').split('#')[0]
+					declared_next_url = (getattr(next_step_resolved, 'url', None) or getattr(next_step_resolved, 'frameUrl', None) or '').split('#')[0]
+					if declared_next_url and declared_next_url != curr_url:
+						logger.info(
+							f"Skipping pre-wait for next selector because declared next URL ({truncate_selector(declared_next_url)}) != current URL ({truncate_selector(curr_url)})"
+						)
+					else:
+						logger.info(f'Waiting for element with selector: {truncate_selector(css_selector)}')
+						locator, selector_used = await get_best_element_handle(
+							page, css_selector, next_step_resolved, timeout_ms=WAIT_FOR_ELEMENT_TIMEOUT
+						)
+						logger.info(f'Element with selector found: {truncate_selector(selector_used)}')
 				except Exception as e:
 					logger.error(f'Failed to wait for element with selector: {truncate_selector(css_selector)}. Error: {e}')
 					raise Exception(f'Failed to wait for element. Selector: {css_selector}') from e
 
 		return result
 
-	def _format_agent_step_context(self, current_step: AgenticWorkflowStep, step_index: int) -> str:
-		"""Format the workflow step context for the agent with extended context (last 2, current, next 2 steps)."""
-
-		def format_step_info(step: WorkflowStep, step_num: int) -> str:
-			"""Format step information consistently."""
-			info = [f'Step {step_num}: Type: {step.type}']
-			if step.description:
-				info.append(f'Description: {step.description}')
-			# For agent steps, show the task
-			if isinstance(step, AgenticWorkflowStep):
-				info.append(f'Task: {step.task}')
-			return '\n'.join(info)
-
-		sections = []
-		total_steps = len(self.schema.steps)
-
-		# Add previous steps context (last 2 steps)
-		prev_steps = []
-		for i in range(max(0, step_index - 2), step_index):
-			prev_step = self.schema.steps[i]
-			prev_steps.append(format_step_info(prev_step, i + 1))
-
-		if prev_steps:
-			sections.extend(['=== PREVIOUS STEPS (FOR CONTEXT ONLY) ===', '\n\n'.join(prev_steps), ''])
-
-		# Add current step context
-		sections.extend(['=== CURRENT STEP (YOUR TASK) ===', format_step_info(current_step, step_index + 1), ''])
-
-		# Add next steps context (next 2 steps)
-		next_steps = []
-		for i in range(step_index + 1, min(total_steps, step_index + 3)):
-			next_step = self.schema.steps[i]
-			next_steps.append(format_step_info(next_step, i + 1))
-
-		if next_steps:
-			sections.extend(
-				[
-					'=== NEXT STEPS (FOR CONTEXT ONLY) ===',
-					'\n\n'.join(next_steps),
-				]
-			)
-
-		return '\n'.join(sections)
-
-	async def _run_agent_step(self, step: AgenticWorkflowStep, step_index: int) -> AgentHistoryList:
+	async def _run_agent_step(self, step: AgenticWorkflowStep) -> AgentHistoryList:
 		"""Spin-up an Agent based on step dictionary."""
-		# Create contextual task with extended context (last 2, current, next 2 steps)
-		contextual_task = self._format_agent_step_context(step, step_index)
-
-		# logger.info(f'Contextual task: {contextual_task}')
-
-		# 		task = """
-		# {step.task}
+		if self.llm is None:
+			raise ValueError("An 'llm' instance must be supplied for agent-based steps")
 
-		# Please do not make up any fake data.
-		# """
+		task: str = step.task
+		max_steps: int = step.max_steps or 5
 
 		agent = Agent(
-			task=step.task,  # Only the current step task goes into ultimate task
-			message_context=contextual_task,  # Extended context with surrounding steps
+			task=task,
 			llm=self.llm,
 			browser_session=self.browser,
-			controller=WorkflowStepAgentController(),
-			# use_vision=True,  # Consider making this configurable via WorkflowStep schema
-			override_system_message=AGENT_STEP_SYSTEM_PROMPT,
+			use_vision=True,  # Consider making this configurable via WorkflowStep schema
+		)
+		return await agent.run(max_steps=max_steps)
+
+	async def _fallback_to_agent(
+		self,
+		step_resolved: WorkflowStep,
+		step_index: int,
+		error: Exception | str | None = None,
+	) -> AgentHistoryList:
+		"""Handle step failure by delegating to an agent."""
+		if self.llm is None:
+			raise ValueError("Cannot fall back to agent: An 'llm' instance must be supplied")
+		# print('Workflow steps:', step_resolved)
+		# Extract details from the failed step dictionary
+		failed_action_name = step_resolved.type
+		failed_params = step_resolved.model_dump()
+		step_description = step_resolved.description or 'No description provided'
+		error_msg = str(error) if error else 'Unknown error'
+		total_steps = len(self.steps)
+		fail_details = (
+			f"step={step_index + 1}/{total_steps}, action='{failed_action_name}', "
+			f"description='{step_description}', params={str(failed_params)}, error='{error_msg}'"
 		)
 
-		return await agent.run()
-
-	# async def _fallback_to_agent(
-	# 	self,
-	# 	step_resolved: WorkflowStep,
-	# 	step_index: int,
-	# 	error: Exception | str | None = None,
-	# ) -> AgentHistoryList:
-	# 	"""Handle step failure by delegating to an agent."""
-
-	# 	# print('Workflow steps:', step_resolved)
-	# 	# Extract details from the failed step dictionary
-	# 	failed_action_name = step_resolved.type
-	# 	failed_params = step_resolved.model_dump()
-	# 	step_description = step_resolved.description or 'No description provided'
-	# 	error_msg = str(error) if error else 'Unknown error'
-	# 	total_steps = len(self.steps)
-	# 	fail_details = (
-	# 		f"step={step_index + 1}/{total_steps}, action='{failed_action_name}', "
-	# 		f"description='{step_description}', params={str(failed_params)}, error='{error_msg}'"
-	# 	)
-
-	# 	# Determine the failed_value based on step type and attributes
-	# 	failed_value = None
-	# 	description_prefix = f'Purpose: {step_description}. ' if step_description else ''
-
-	# 	if isinstance(step_resolved, NavigationStep):
-	# 		failed_value = f'{description_prefix}Navigate to URL: {step_resolved.url}'
-	# 	elif isinstance(step_resolved, ClickStep):
-	# 		# element_info = step_resolved.elementText or step_resolved.cssSelector
-	# 		# failed_value = f"{description_prefix}Click element: {element_info}"
-	# 		failed_value = f'Find and click element with description: {step_resolved.description}'
-	# 	elif isinstance(step_resolved, InputStep):
-	# 		failed_value = f"{description_prefix}Input text: '{step_resolved.value}' into element."
-	# 	elif isinstance(step_resolved, SelectChangeStep):
-	# 		failed_value = f"{description_prefix}Select option: '{step_resolved.selectedText}' in dropdown."
-	# 	elif isinstance(step_resolved, KeyPressStep):
-	# 		failed_value = f"{description_prefix}Press key: '{step_resolved.key}'"
-	# 	elif isinstance(step_resolved, ScrollStep):
-	# 		failed_value = f'{description_prefix}Scroll to position: (x={step_resolved.scrollX}, y={step_resolved.scrollY})'
-	# 	else:
-	# 		failed_value = f"{description_prefix}No specific target value available for action '{failed_action_name}'"
-
-	# 	# Build workflow overview using the stored dictionaries
-	# 	workflow_overview_lines: list[str] = []
-	# 	for idx, step in enumerate(self.steps):
-	# 		desc = step.description or ''
-	# 		step_type_info = step.type
-	# 		details = step.model_dump()
-	# 		workflow_overview_lines.append(f'  {idx + 1}. ({step_type_info}) {desc} - {details}')
-	# 	workflow_overview = '\n'.join(workflow_overview_lines)
-	# 	# print(workflow_overview)
-
-	# 	# Build the fallback task with the failed_value
-	# 	fallback_task = WORKFLOW_FALLBACK_PROMPT_TEMPLATE.format(
-	# 		step_index=step_index + 1,
-	# 		total_steps=len(self.steps),
-	# 		workflow_details=workflow_overview,
-	# 		action_type=failed_action_name,
-	# 		fail_details=fail_details,
-	# 		failed_value=failed_value,
-	# 		step_description=step_description,
-	# 	)
-	# 	logger.info(f'Agent fallback task: {fallback_task}')
-
-	# 	# Prepare agent step config based on the failed step, adding task
-	# 	agent_step_config = AgenticWorkflowStep(
-	# 		type='agent',
-	# 		task=fallback_task,
-	# 		max_steps=5,
-	# 		output=None,
-	# 		description='Fallback agent to handle step failure',
-	# 	)
-
-	# 	return await self._run_agent_step(agent_step_config)
+		# Determine the failed_value based on step type and attributes
+		failed_value = None
+		description_prefix = f'Purpose: {step_description}. ' if step_description else ''
+
+		if isinstance(step_resolved, NavigationStep):
+			failed_value = f'{description_prefix}Navigate to URL: {step_resolved.url}'
+		elif isinstance(step_resolved, ClickStep):
+			# element_info = step_resolved.elementText or step_resolved.cssSelector
+			# failed_value = f"{description_prefix}Click element: {element_info}"
+			failed_value = f'Find and click element with description: {step_resolved.description}'
+		elif isinstance(step_resolved, InputStep):
+			failed_value = f"{description_prefix}Input text: '{step_resolved.value}' into element."
+		elif isinstance(step_resolved, SelectChangeStep):
+			failed_value = f"{description_prefix}Select option: '{step_resolved.selectedText}' in dropdown."
+		elif isinstance(step_resolved, KeyPressStep):
+			failed_value = f"{description_prefix}Press key: '{step_resolved.key}'"
+		elif isinstance(step_resolved, ScrollStep):
+			failed_value = f'{description_prefix}Scroll to position: (x={step_resolved.scrollX}, y={step_resolved.scrollY})'
+		else:
+			failed_value = f"{description_prefix}No specific target value available for action '{failed_action_name}'"
+
+		# Build workflow overview using the stored dictionaries
+		workflow_overview_lines: list[str] = []
+		for idx, step in enumerate(self.steps):
+			desc = step.description or ''
+			step_type_info = step.type
+			details = step.model_dump()
+			workflow_overview_lines.append(f'  {idx + 1}. ({step_type_info}) {desc} - {details}')
+		workflow_overview = '\n'.join(workflow_overview_lines)
+		# print(workflow_overview)
+
+		# Build the fallback task with the failed_value
+		fallback_task = WORKFLOW_FALLBACK_PROMPT_TEMPLATE.format(
+			step_index=step_index + 1,
+			total_steps=len(self.steps),
+			workflow_details=workflow_overview,
+			action_type=failed_action_name,
+			fail_details=fail_details,
+			failed_value=failed_value,
+			step_description=step_description,
+		)
+		logger.info(f'Agent fallback task: {fallback_task}')
+
+		# Prepare agent step config based on the failed step, adding task
+		agent_step_config = AgenticWorkflowStep(
+			type='agent',
+			task=fallback_task,
+			max_steps=5,
+			output=None,
+			description='Fallback agent to handle step failure',
+		)
+
+		return await self._run_agent_step(agent_step_config)
 
 	def _validate_inputs(self, inputs: dict[str, Any]) -> None:
 		"""Validate provided inputs against the workflow's input schema definition."""
@@ -421,35 +387,31 @@ async def _execute_step(self, step_index: int, step_resolved: WorkflowStep) -> A
 				logger.warning(
 					f'Deterministic step {step_index + 1} ({action_name}) failed: {e}. Attempting fallback with agent.'
 				)
-
-				raise ValueError(f'Deterministic step {step_index + 1} ({action_name}) failed: {e}')
-
-				# if self.fallback_to_agent:
-				# 	result = await self._fallback_to_agent(step_resolved, step_index, e)
-				# 	if not result.is_successful():
-				# 		raise ValueError(f'Deterministic step {step_index + 1} ({action_name}) failed even after fallback')
-				# else:
-				# 	raise ValueError(f'Deterministic step {step_index + 1} ({action_name}) failed: {e}')
-
+				if not self.fallback_to_agent:
+					raise ValueError(f'Deterministic step {step_index + 1} ({action_name}) failed: {e}') from e
+				if self.llm is None:
+					raise ValueError('Cannot fall back to agent: LLM instance required.') from e
+				# Fallback is enabled and an LLM is available: hand the failed step to an agent.
+				result = await self._fallback_to_agent(step_resolved, step_index, e)
+				if not result.is_successful():
+					raise ValueError(f'Deterministic step {step_index + 1} ({action_name}) failed even after fallback')
 		elif isinstance(step_resolved, AgenticWorkflowStep):
 			# Use task key from step dictionary
 			task_description = step_resolved.task
 			logger.info(f'Running agent task: {task_description}')
 			try:
-				result = await self._run_agent_step(step_resolved, step_index)
+				result = await self._run_agent_step(step_resolved)
 				if not result.is_successful():
 					logger.warning(f'Agent step {step_index + 1} failed evaluation.')
 					raise ValueError(f'Agent step {step_index + 1} failed evaluation.')
-
 			except Exception as e:
-				raise ValueError(f'Agent step {step_index + 1} failed: {e}. (Agent fallback is disabled)')
-
 				if self.fallback_to_agent:
 					logger.warning(f'Agent step {step_index + 1} failed: {e}. Attempting fallback with agent.')
-
-					# result = await self._fallback_to_agent(step_resolved, step_index, e)
-					# if not result.is_successful():
-					# 	raise ValueError(f'Agent step {step_index + 1} failed even after fallback')
+					if self.llm is None:
+						raise ValueError('Cannot fall back to agent: LLM instance required.')
+					result = await self._fallback_to_agent(step_resolved, step_index, e)
+					if not result.is_successful():
+						raise ValueError(f'Agent step {step_index + 1} failed even after fallback')
 				else:
 					raise ValueError(f'Agent step {step_index + 1} failed: {e}')
 
@@ -463,7 +425,7 @@ async def _convert_results_to_output_model(
 	) -> T:
 		"""Convert workflow results to a specified output model.
 
-		Filters ActionResults with extracted_content, then uses LLM to parse
+		Filters ActionResults with extracted_content, then uses LangChain to parse
 		all extracted texts into the structured output model.
 
 		Args:
@@ -476,6 +438,9 @@ async def _convert_results_to_output_model(
 		if not results:
 			raise ValueError('No results to convert')
 
+		if self.llm is None:
+			raise ValueError('LLM is required for structured output conversion')
+
 		# Extract all content from ActionResults
 		extracted_contents = []
 
@@ -496,13 +461,15 @@ async def _convert_results_to_output_model(
 		# Combine all extracted contents
 		combined_text = '\n\n'.join(extracted_contents)
 
-		messages = [
-			SystemMessage(content=STRUCTURED_OUTPUT_PROMPT),
-			UserMessage(content=combined_text),
+		messages: list[BaseMessage] = [
+			AIMessage(content=STRUCTURED_OUTPUT_PROMPT),
+			HumanMessage(content=combined_text),
 		]
 
-		response = await self.llm.ainvoke(messages, output_format=output_model)
-		return response.completion
+		chain = self.llm.with_structured_output(output_model)
+		chain_result: T = await chain.ainvoke(messages)  # type: ignore
+
+		return chain_result
 
 	async def run_step(self, step_index: int, inputs: dict[str, Any] | None = None):
 		"""Run a *single* workflow step asynchronously and return its result.
@@ -516,8 +483,8 @@ async def run_step(self, step_index: int, inputs: dict[str, Any] | None = None):
 				are validated and injected into :pyattr:`context`.  Subsequent
 				calls can omit *inputs* as :pyattr:`context` is already populated.
 		"""
-		if not (0 <= step_index < len(self.schema.steps)):
-			raise IndexError(f'step_index {step_index} is out of range for workflow with {len(self.schema.steps)} steps')
+		if not (0 <= step_index < len(self.steps)):
+			raise IndexError(f'step_index {step_index} is out of range for workflow with {len(self.steps)} steps')
 
 		# Initialise/augment context once with the provided inputs
 		if inputs is not None or not self.context:
@@ -531,7 +498,7 @@ async def run_step(self, step_index: int, inputs: dict[str, Any] | None = None):
 				self.context.update(runtime_inputs)
 
 		async with self.browser:
-			raw_step_cfg = self.schema.steps[step_index]
+			raw_step_cfg = self.steps[step_index]
 			step_resolved = self._resolve_placeholders(raw_step_cfg)
 			result = await self._execute_step(step_index, step_resolved)
 			# Persist outputs (if declared) for future steps
@@ -573,8 +540,9 @@ async def run(
 
 		await self.browser.start()
 		try:
-			for step_index, step_dict in enumerate(self.schema.steps):  # self.steps now holds dictionaries
+			for step_index, step_dict in enumerate(self.steps):  # self.steps now holds dictionaries
 				await asyncio.sleep(0.1)
+				await self.browser._wait_for_stable_network()
 
 				# Check if cancellation was requested
 				if cancel_event and cancel_event.is_set():
@@ -583,7 +551,7 @@ async def run(
 
 				# Use description from the step dictionary
 				step_description = step_dict.description or 'No description provided'
-				logger.info(f'--- Running Step {step_index + 1}/{len(self.schema.steps)} -- {step_description} ---')
+				logger.info(f'--- Running Step {step_index + 1}/{len(self.steps)} -- {step_description} ---')
 				# Resolve placeholders using the current context (works on the dictionary)
 				step_resolved = self._resolve_placeholders(step_dict)
 
@@ -604,67 +572,39 @@ async def run(
 			# Clean-up browser after finishing workflow
 			if close_browser_at_end:
 				self.browser.browser_profile.keep_alive = False
-				await self.browser.stop()
+				await self.browser.close()
 
 		return WorkflowRunOutput(step_results=results, output_model=output_model_result)
 
 	# ------------------------------------------------------------------
-	# LLM tool wrapper
+	# LangChain tool wrapper
 	# ------------------------------------------------------------------
 
 	def _build_input_model(self) -> type[BaseModel]:
-		"""Return a *pydantic* model matching the workflow's ``input_schema`` section.
-
-		This creates a dynamic Pydantic model that includes format information in field
-		descriptions, making format requirements visible to LLMs when workflows are used as tools.
-		"""
-
+		"""Return a *pydantic* model matching the workflow's ``input_schema`` section."""
 		if not self.inputs_def:
 			# No declared inputs -> generate an empty model
 			# Use schema name for uniqueness, fallback if needed
 			model_name = f'{(self.schema.name or "Workflow").replace(" ", "_")}_NoInputs'
 			return create_model(model_name)
 
-		# Map workflow input types to Python types
 		type_mapping = {
 			'string': str,
 			'number': float,
-			'bool': bool,
+			'bool': bool,  # Added boolean type
 		}
-
-		# Build fields dictionary for create_model()
 		fields: Dict[str, tuple[type, Any]] = {}
-
 		for input_def in self.inputs_def:
 			name = input_def.name
 			type_str = input_def.type
 			py_type = type_mapping.get(type_str)
-
 			if py_type is None:
 				raise ValueError(f'Unsupported input type: {type_str!r} for field {name!r}')
-
-			# Create field description with format information if available
-			# This helps LLMs understand expected input formats when workflow is used as a tool
-			field_description = None
-			if hasattr(input_def, 'format') and input_def.format:
-				field_description = f'Format: {input_def.format}'
-
-			# Build field tuple: (type, default_or_field_info)
 			# Pydantic's create_model uses ... (Ellipsis) to mark required fields
-			if input_def.required:
-				if field_description:
-					# Required field with format description
-					fields[name] = (py_type, Field(..., description=field_description))
-				else:
-					# Required field without format description
-					fields[name] = (py_type, ...)
-			else:
-				if field_description:
-					# Optional field with format description
-					fields[name] = (py_type, Field(None, description=field_description))
-				else:
-					# Optional field without format description
-					fields[name] = (py_type, None)
+			default = ... if input_def.required else None
+			fields[name] = (py_type, default)
+
+		from typing import cast as _cast
 
 		# The raw ``create_model`` helper from Pydantic deliberately uses *dynamic*
 		# signatures, which the static type checker cannot easily verify.  We cast
@@ -674,119 +614,65 @@ def _build_input_model(self) -> type[BaseModel]:
 			**_cast(Dict[str, Any], fields),
 		)
 
-	async def run_as_tool(self, prompt: str) -> str:
-		"""Run the workflow with inputs parsed from a natural language prompt.
-
-		Args:
-			prompt: Natural language description of the task and inputs
+	def as_tool(self, *, name: str | None = None, description: str | None = None):  # noqa: D401
+		"""Expose the entire workflow as a LangChain *StructuredTool* instance.
 
-		Returns:
-			JSON string with workflow results
+		The generated tool validates its arguments against the workflow's input
+		schema (if present) and then returns the JSON-serialised output of
+		:py:meth:`run`.
 		"""
-		if self.llm is None:
-			raise ValueError("LLM is required for run_as_tool to parse inputs from prompt")
-
-		# Parse inputs from prompt using LLM
-		input_model = self._build_input_model()
-
-		system_prompt = f"""You are a helpful assistant that extracts workflow input parameters from user prompts.
-The workflow requires the following inputs:
-{json.dumps(input_model.model_json_schema(), indent=2)}
-
-Extract the values from the user's prompt and return them in the required format."""
-
-		messages = [
-			SystemMessage(content=system_prompt),
-			UserMessage(content=prompt)
-		]
-
-		response = await self.llm.ainvoke(messages, output_format=input_model)
-		inputs = response.completion.model_dump()
-
-		# Run the workflow with parsed inputs
-		result = await self.run(inputs=inputs, close_browser_at_end=True)
-
-		# Return results as JSON
-		output = {
-			"success": True,
-			"steps_executed": len(result.step_results),
-			"inputs_used": inputs,
-			"context": self.context
-		}
-
-		return json.dumps(output, indent=2)
-
-	async def run_with_no_ai(
-		self,
-		inputs: dict[str, Any] | None = None,
-		close_browser_at_end: bool = True,
-		cancel_event: asyncio.Event | None = None,
-		output_model: type[T] | None = None,
-	) -> WorkflowRunOutput[T]:
-		"""Execute the workflow using semantic abstraction without any AI/LLM involvement.
 
-		This method uses semantic mapping to convert visible text to deterministic selectors,
-		avoiding expensive LLM calls and fragile CSS selectors.
+		InputModel = self._build_input_model()
+		# Use schema name as default, sanitize for tool name requirements
+		default_name = ''.join(c if c.isalnum() else '_' for c in self.name)
+		tool_name = name or default_name[:50]
+		doc = description or self.description  # Use schema description
+
+		# `self` is closed over via the inner function so we can keep state.
+		async def _invoke(**kwargs):  # type: ignore[override]
+			logger.info(f'Running workflow as tool with inputs: {kwargs}')
+			augmented_inputs = kwargs.copy() if kwargs else {}
+			for input_def in self.inputs_def:
+				if not input_def.required and input_def.name not in augmented_inputs:
+					augmented_inputs[input_def.name] = ''
+			result = await self.run(inputs=augmented_inputs)
+			# Serialise non-string output so models that expect a string tool
+			# response still work.
+			try:
+				return _json.dumps(result, default=str)
+			except Exception:
+				return str(result)
 
-		Args:
-			inputs: Optional dictionary of workflow inputs
-			close_browser_at_end: Whether to close the browser when done
-			cancel_event: Optional event to signal cancellation
-			output_model: Optional Pydantic model class to convert results to
+		return StructuredTool.from_function(
+			coroutine=_invoke,
+			name=tool_name,
+			description=doc,
+			args_schema=InputModel,
+		)
 
-		Returns:
-			WorkflowRunOutput containing all step results
+	async def run_as_tool(self, prompt: str) -> str:
 		"""
-		from workflow_use.workflow.semantic_executor import SemanticWorkflowExecutor
-
-		runtime_inputs = inputs or {}
-		# 1. Validate inputs against definition
-		self._validate_inputs(runtime_inputs)
-		# 2. Initialize context with validated inputs
-		self.context = runtime_inputs.copy()  # Start with a fresh context
-
-		results: List[ActionResult | AgentHistoryList] = []
-
-		await self.browser.start()
-		semantic_executor = SemanticWorkflowExecutor(self.browser, page_extraction_llm=self.page_extraction_llm)
-
-		try:
-			for step_index, step_dict in enumerate(self.schema.steps):
-				await asyncio.sleep(0.1)
+		Run the workflow with a prompt and automatically parse the required variables.
 
-				# Check if cancellation was requested
-				if cancel_event and cancel_event.is_set():
-					logger.info('Cancellation requested - stopping workflow execution')
-					break
-
-				# Use description from the step dictionary
-				step_description = step_dict.description or 'No description provided'
-				logger.info(f'--- Running Step {step_index + 1}/{len(self.schema.steps)} -- {step_description} ---')
-
-				# Resolve placeholders using the current context (works on the dictionary)
-				step_resolved = self._resolve_placeholders(step_dict)
-
-				# Only process deterministic steps (no agent steps)
-				if step_resolved.type == 'agent':
-					raise Exception(f"Agent steps are not supported in run_with_no_ai mode. Step {step_index + 1} is an agent step.")
-
-				# Execute step using semantic executor
-				result = await semantic_executor.execute_step(step_resolved)
-
-				results.append(result)
-				# Persist outputs using the resolved step dictionary
-				self._store_output(step_resolved, result)
-				logger.info(f'--- Finished Step {step_index + 1} ---\n')
-
-			# Convert results to output model if requested
-			output_model_result: T | None = None
-			if output_model:
-				output_model_result = await self._convert_results_to_output_model(results, output_model)
+		@dev Uses AgentExecutor to properly handle the tool invocation loop.
+		"""
 
-		finally:
-			# Clean-up browser after finishing workflow
-			if close_browser_at_end:
-				self.browser.browser_profile.keep_alive = False
-				await self.browser.stop()
+		# For now I kept it simple but one could think of using a react agent here.
+		if self.llm is None:
+			raise ValueError("Cannot run as tool: An 'llm' instance must be supplied for tool-based steps")
+
+		prompt_template = ChatPromptTemplate.from_messages(
+			[
+				('system', 'You are a helpful assistant'),
+				('human', '{input}'),
+				# Placeholders fill up a **list** of messages
+				('placeholder', '{agent_scratchpad}'),
+			]
+		)
 
-		return WorkflowRunOutput(step_results=results, output_model=output_model_result)
+		# Create the workflow tool
+		workflow_tool = self.as_tool()
+		agent = create_tool_calling_agent(self.llm, [workflow_tool], prompt_template)
+		agent_executor = AgentExecutor(agent=agent, tools=[workflow_tool])
+		result = await agent_executor.ainvoke({'input': prompt})
+		return result['output']

From 015fb5c3b1cb6bf1b47c2ae063ae68b25a11fc1c Mon Sep 17 00:00:00 2001
From: Diveyam Mishra <mishradiveyam@gmail.com>
Date: Sat, 25 Oct 2025 23:37:14 +0530
Subject: [PATCH 09/13] chore: address review feedback

---
 workflows/workflow_use/controller/service.py | 34 +++++++++-----------
 workflows/workflow_use/recorder/service.py   | 11 +++++--
 workflows/workflow_use/workflow/service.py   |  4 +--
 3 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py
index 2465d8cc..505ef6b6 100644
--- a/workflows/workflow_use/controller/service.py
+++ b/workflows/workflow_use/controller/service.py
@@ -6,6 +6,7 @@
 from browser_use.controller.service import Controller
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.prompts import PromptTemplate
+from urllib.parse import urlparse
 
 from workflow_use.controller.utils import get_best_element_handle, truncate_selector
 from workflow_use.controller.views import (
@@ -107,30 +108,27 @@ def _select_context(pg):
 								return pg
 						return f
 					if furl:
-						from urllib.parse import urlparse
-						pf = urlparse(furl)
-						# If frameUrl equals current page URL (origin+path), stay on page
+					pf = urlparse(furl)
+					# If frameUrl equals current page URL (origin+path), stay on page
+					try:
+						cu = urlparse(curr_url)
+						if (cu.scheme, cu.netloc, cu.path) == (pf.scheme, pf.netloc, pf.path):
+							return pg
+					except Exception:
+						pass
+					for fr in pg.frames:
 						try:
-							from urllib.parse import urlparse as _u
-							cu = _u(curr_url)
-							if (cu.scheme, cu.netloc, cu.path) == (pf.scheme, pf.netloc, pf.path):
-								return pg
+							ff = urlparse(fr.url)
+							if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(furl):
+								return fr
 						except Exception:
-							pass
-						for fr in pg.frames:
-							try:
-								ff = urlparse(fr.url)
-								if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(furl):
-									return fr
-							except Exception:
-								continue
+							continue
 				except Exception:
 					return pg
 				return ctx
 
 			# Fallback: search all frames for selector (prefer frames matching target origin)
 			async def _find_in_frames(pg, selector: str):
-				from urllib.parse import urlparse
 				prefer = getattr(params, 'frameUrl', None) or getattr(params, 'url', None) or ''
 				pref_o = urlparse(prefer) if prefer else None
 				frames = list(pg.frames)
@@ -176,7 +174,7 @@ def score(fr):
 
 				await locator.click(force=True)
 
-				used_str = selector_used if isinstance(selector_used, str) and selector_used else params.cssSelector
+				used_str = selector_used if isinstance(selector_used, str) else params.cssSelector
 				msg = f'🖱️  Clicked element with CSS selector: {truncate_selector(used_str)} (original: {truncate_selector(original_selector)})'
 				logger.info(msg)
 				return ActionResult(extracted_content=msg, include_in_memory=True)
@@ -285,7 +283,7 @@ async def key_press(params: KeyPressDeterministicAction, browser_session: Browse
 		async def scroll(params: ScrollDeterministicAction, browser_session: Browser) -> ActionResult:
 			"""Scroll the page by the given x/y pixel offsets."""
 			page = await browser_session.get_current_page()
-			await page.evaluate(f'window.scrollBy({params.scrollX}, {params.scrollY});')
+			await page.evaluate('(x, y) => window.scrollBy(x, y)', params.scrollX, params.scrollY)
 			msg = f'📜  Scrolled page by (x={params.scrollX}, y={params.scrollY})'
 			logger.info(msg)
 			return ActionResult(extracted_content=msg, include_in_memory=True)
diff --git a/workflows/workflow_use/recorder/service.py b/workflows/workflow_use/recorder/service.py
index 5de21556..4c2f217f 100644
--- a/workflows/workflow_use/recorder/service.py
+++ b/workflows/workflow_use/recorder/service.py
@@ -2,6 +2,7 @@
 import json
 import pathlib
 from typing import Optional
+from urllib.parse import urlparse
 
 import uvicorn
 from browser_use import Browser
@@ -88,14 +89,18 @@ async def _capture_and_signal_final_workflow(self, trigger_reason: str):
 				wf = self.last_workflow_update_event.payload
 				# Backend safety filter: drop about:blank and obvious ad/analytics iframe navigations
 				try:
+					def _step_field(step, field: str):
+						if isinstance(step, dict):
+							return step.get(field)
+						return getattr(step, field, None)
+
 					clean_steps = []
 					for s in wf.steps:
-						st = getattr(s, 'type', None) or (s.get('type') if isinstance(s, dict) else None)
-						url = getattr(s, 'url', None) or (s.get('url') if isinstance(s, dict) else None)
+						st = _step_field(s, 'type')
+						url = _step_field(s, 'url')
 						if st == 'navigation':
 							if not url or url == 'about:blank':
 								continue
-							from urllib.parse import urlparse
 							host = urlparse(url).hostname or ''
 							blocked = any(
 								pat in host for pat in (
diff --git a/workflows/workflow_use/workflow/service.py b/workflows/workflow_use/workflow/service.py
index d21da2e8..4b7c1ca4 100644
--- a/workflows/workflow_use/workflow/service.py
+++ b/workflows/workflow_use/workflow/service.py
@@ -11,7 +11,7 @@
 from browser_use.agent.views import ActionResult, AgentHistoryList
 from langchain.agents import AgentExecutor, create_tool_calling_agent
 from langchain_core.language_models.chat_models import BaseChatModel
-from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
+from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.tools import StructuredTool
 from pydantic import BaseModel, create_model
@@ -462,7 +462,7 @@ async def _convert_results_to_output_model(
 		combined_text = '\n\n'.join(extracted_contents)
 
 		messages: list[BaseMessage] = [
-			AIMessage(content=STRUCTURED_OUTPUT_PROMPT),
+			SystemMessage(content=STRUCTURED_OUTPUT_PROMPT),
 			HumanMessage(content=combined_text),
 		]
 

From 5bfd65ba7b30e461c9a3df4f0f422697deca94dc Mon Sep 17 00:00:00 2001
From: Diveyam Mishra <mishradiveyam@gmail.com>
Date: Sat, 25 Oct 2025 23:51:25 +0530
Subject: [PATCH 10/13] fix: default optional workflow inputs to None

---
 workflows/workflow_use/workflow/service.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/workflow_use/workflow/service.py b/workflows/workflow_use/workflow/service.py
index 4b7c1ca4..176709dc 100644
--- a/workflows/workflow_use/workflow/service.py
+++ b/workflows/workflow_use/workflow/service.py
@@ -634,7 +634,7 @@ async def _invoke(**kwargs):  # type: ignore[override]
 			augmented_inputs = kwargs.copy() if kwargs else {}
 			for input_def in self.inputs_def:
 				if not input_def.required and input_def.name not in augmented_inputs:
-					augmented_inputs[input_def.name] = ''
+					augmented_inputs[input_def.name] = None
 			result = await self.run(inputs=augmented_inputs)
 			# Serialise non-string output so models that expect a string tool
 			# response still work.

From 207b64622f1a8a1477aa07e5e042fca7f837ad46 Mon Sep 17 00:00:00 2001
From: Diveyam Mishra <mishradiveyam@gmail.com>
Date: Sun, 26 Oct 2025 02:26:00 +0530
Subject: [PATCH 11/13] refactor(controller): avoid direct playwright import

---
 workflows/workflow_use/controller/service.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py
index 505ef6b6..5e8fa56f 100644
--- a/workflows/workflow_use/controller/service.py
+++ b/workflows/workflow_use/controller/service.py
@@ -87,8 +87,7 @@ async def click(params: ClickElementDeterministicAction, browser_session: Browse
 			# If frameUrl or frameIdPath are provided, narrow the search to that frame
 			def _select_context(pg):
 				try:
-					from playwright.async_api import Page, Frame
-					ctx: Page | Frame = pg
+					ctx = pg
 					# If frame hints point to top document, stay on page
 					fid = getattr(params, 'frameIdPath', None)
 					furl = getattr(params, 'frameUrl', None)
@@ -99,7 +98,9 @@ def _select_context(pg):
 						segs = [s for s in str(fid).split('.') if s != '']
 						if all(s == '0' for s in segs):
 							return pg
-						f = pg.main_frame
+						f = getattr(pg, 'main_frame', None)
+						if not f:
+							return pg
 						for s in segs[1:]:  # skip top marker
 							idx = int(s)
 							if 0 <= idx < len(f.child_frames):
@@ -116,7 +117,7 @@ def _select_context(pg):
 							return pg
 					except Exception:
 						pass
-					for fr in pg.frames:
+					for fr in getattr(pg, 'frames', []):
 						try:
 							ff = urlparse(fr.url)
 							if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(furl):
@@ -131,7 +132,7 @@ def _select_context(pg):
 			async def _find_in_frames(pg, selector: str):
 				prefer = getattr(params, 'frameUrl', None) or getattr(params, 'url', None) or ''
 				pref_o = urlparse(prefer) if prefer else None
-				frames = list(pg.frames)
+				frames = list(getattr(pg, 'frames', []))
 				def score(fr):
 					if not pref_o:
 						return 0

From 7c4a0cc583291c10b37cbaed16632172ae2e00cd Mon Sep 17 00:00:00 2001
From: Diveyam Mishra <118588477+Diveyam-Mishra@users.noreply.github.com>
Date: Mon, 27 Oct 2025 01:24:05 +0530
Subject: [PATCH 12/13] Update workflows/workflow_use/controller/service.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 workflows/workflow_use/controller/service.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py
index 470fe804..39ee6f74 100644
--- a/workflows/workflow_use/controller/service.py
+++ b/workflows/workflow_use/controller/service.py
@@ -184,7 +184,7 @@ async def key_press(params: KeyPressDeterministicAction, browser_session: Browse
 		@self.registry.action('Scroll page', param_model=ScrollDeterministicAction)
 		async def scroll(params: ScrollDeterministicAction, browser_session: Browser) -> ActionResult:
 			page = await browser_session.must_get_current_page()
-			await page.evaluate('(x, y) => { window.scrollBy(x, y); return ""; }', params.scrollX, params.scrollY)
+			await page.evaluate('(x, y) => { window.scrollBy(x, y); }', params.scrollX, params.scrollY)
 			msg = f'Scrolled page by (x={params.scrollX}, y={params.scrollY})'
 			logger.info(msg)
 			return ActionResult(extracted_content=msg, include_in_memory=True)

From 1c377b34e527a5fbe671a150ad18818cdaf380c9 Mon Sep 17 00:00:00 2001
From: Diveyam Mishra <118588477+Diveyam-Mishra@users.noreply.github.com>
Date: Mon, 27 Oct 2025 01:35:54 +0530
Subject: [PATCH 13/13] Update workflows/workflow_use/controller/service.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 workflows/workflow_use/controller/service.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py
index 39ee6f74..7395d7fd 100644
--- a/workflows/workflow_use/controller/service.py
+++ b/workflows/workflow_use/controller/service.py
@@ -184,7 +184,7 @@ async def key_press(params: KeyPressDeterministicAction, browser_session: Browse
 		@self.registry.action('Scroll page', param_model=ScrollDeterministicAction)
 		async def scroll(params: ScrollDeterministicAction, browser_session: Browser) -> ActionResult:
 			page = await browser_session.must_get_current_page()
-			await page.evaluate('(x, y) => { window.scrollBy(x, y); }', params.scrollX, params.scrollY)
+			await page.evaluate('([x, y]) => { window.scrollBy(x, y); }', [params.scrollX, params.scrollY])
 			msg = f'Scrolled page by (x={params.scrollX}, y={params.scrollY})'
 			logger.info(msg)
 			return ActionResult(extracted_content=msg, include_in_memory=True)