From ab941a2089f31b623fa3930799705446cae1d6ff Mon Sep 17 00:00:00 2001
From: Diveyam-Mishra
Date: Mon, 11 Aug 2025 03:00:16 +0530
Subject: [PATCH 01/13] Fix stalled new-tab recording and reduce noisy/duplicate steps

Problems addressed:
- New tab clicks (target=_blank / modifier clicks) stalled recording; child tab steps were missing.
- Background/ad/tracker tabs polluted logs.
- Excessive duplicate navigation events per redirect/loading cycle.
- Massive explosion of input steps (hundreds of empty, unchanged values).
- Unnecessary workflow updates when steps are unchanged.

New Tab Intent Heuristic:
- Content script emits PREPARE_NEW_TAB on ctrl/cmd/middle click or target=_blank.
- Background correlates the upcoming chrome.tabs.onCreated event to mark the tab as userInitiated.
- Activated tabs are tracked; only activated or userInitiated tabs produce steps.

Tab Filtering:
- Suppress all events (except activation) from tabs that were never activated and are not correlated with an intent window (4s).
- Reduces noise from ads/trackers.

Navigation Consolidation:
- Maintain lastNavigationIndexByTab; update the existing navigation step instead of appending duplicates during rapid redirects or title/url churn.

Input Event Deduplication:
- Content script: per-xpath cache; skip unchanged value; debounce; skip rapid empty repeats.
- Background: merge consecutive identical field edits; collapse bursts of empty values within 5s (timestamp refresh only).
- Track lastInputPerKey (tabId|xpath) to decide merge vs. new step.

--- extension/src/entrypoints/background.ts | 222 ++++++++++++++---------- extension/src/entrypoints/content.ts | 44 +++++ 2 files changed, 176 insertions(+), 90 deletions(-) diff --git a/extension/src/entrypoints/background.ts b/extension/src/entrypoints/background.ts index 283a16f4..50c05309 100644 --- a/extension/src/entrypoints/background.ts +++ b/extension/src/entrypoints/background.ts @@ -30,6 +30,19 @@ export default defineBackground(() => { // Store tab information (URL, potentially title) const tabInfo: { [tabId: number]: { url?: string; title?: string } } = {}; + // Track which tabs have been explicitly activated (brought to foreground) by the user. + // We will ignore events originating from tabs that were never activated to reduce noise + // (for example: ad / tracker tabs that load in the background). + const activatedTabs = new Set(); + + // Track user clicks that are likely to open a new tab (Ctrl/Cmd + click, target=_blank etc.). + // Content scripts will send a PREPARE_NEW_TAB signal; we keep timestamp to correlate + // shortly following chrome.tabs.onCreated events so we can mark those tabs as user initiated. + const recentNewTabIntents: { [openerTabId: number]: number } = {}; + + // Heuristic window (ms) within which a created tab following a user intent is considered relevant. + const NEW_TAB_INTENT_WINDOW_MS = 4000; + let isRecordingEnabled = true; // Default to disabled (OFF) let lastWorkflowHash: string | null = null; // Cache for the last logged workflow hash @@ -144,6 +157,16 @@ export default defineBackground(() => { console.log(`Sending ${type}:`, payload); const tabId = payload.tabId; if (tabId) { + // Skip capturing events for tabs that have never been activated AND are not the original opener + // unless we have positively identified them as a recent user initiated tab (click intent -> creation). 
+ if ( + type !== "CUSTOM_TAB_ACTIVATED" && + !activatedTabs.has(tabId) && + !(payload.openerTabId && recentNewTabIntents[payload.openerTabId] && Date.now() - recentNewTabIntents[payload.openerTabId] < NEW_TAB_INTENT_WINDOW_MS) + ) { + // Silently ignore background noise (ad/tracker tabs) until user actually focuses them. + return; + } if (!sessionLogs[tabId]) { sessionLogs[tabId] = []; } @@ -171,6 +194,12 @@ export default defineBackground(() => { url: tab.pendingUrl || tab.url, windowId: tab.windowId, index: tab.index, + userInitiated: + !!( + tab.openerTabId && + recentNewTabIntents[tab.openerTabId] && + Date.now() - recentNewTabIntents[tab.openerTabId] < NEW_TAB_INTENT_WINDOW_MS + ), }); }); @@ -188,6 +217,7 @@ export default defineBackground(() => { }); chrome.tabs.onActivated.addListener((activeInfo) => { + activatedTabs.add(activeInfo.tabId); sendTabEvent("CUSTOM_TAB_ACTIVATED", { tabId: activeInfo.tabId, windowId: activeInfo.windowId, @@ -214,64 +244,90 @@ export default defineBackground(() => { function convertStoredEventsToSteps(events: StoredEvent[]): Step[] { const steps: Step[] = []; + const lastNavigationIndexByTab: Record = {}; + const lastInputPerKey: Record = {}; for (const event of events) { switch (event.messageType) { - case "CUSTOM_CLICK_EVENT": { - const clickEvent = event as StoredCustomClickEvent; - // Ensure required fields are present, even if optional in source type for some reason + case "CUSTOM_TAB_CREATED": + case "CUSTOM_TAB_UPDATED": + case "CUSTOM_TAB_ACTIVATED": { + const navUrl = (event as any).url || (event as any).changeInfo?.url; + if (!navUrl) break; + const tabId = (event as any).tabId; + const userInitiated = (event as any).userInitiated; + if (!activatedTabs.has(tabId) && !userInitiated) break; // suppress background noise + + const existingIdx = lastNavigationIndexByTab[tabId]; if ( - clickEvent.url && - clickEvent.frameUrl && - clickEvent.xpath && - clickEvent.elementTag + existingIdx !== undefined && + steps[existingIdx] 
&& + steps[existingIdx].type === "navigation" ) { + // Update existing navigation (redirect / title change) + (steps[existingIdx] as NavigationStep).url = navUrl; + steps[existingIdx].timestamp = event.timestamp; + } else { + const nav: NavigationStep = { + type: "navigation", + timestamp: event.timestamp, + tabId, + url: navUrl, + }; + steps.push(nav); + lastNavigationIndexByTab[tabId] = steps.length - 1; + } + break; + } + case "CUSTOM_CLICK_EVENT": { + const click = event as StoredCustomClickEvent; + if (click.url && click.xpath && click.elementTag) { const step: ClickStep = { type: "click", - timestamp: clickEvent.timestamp, - tabId: clickEvent.tabId, - url: clickEvent.url, - frameUrl: clickEvent.frameUrl, - xpath: clickEvent.xpath, - cssSelector: clickEvent.cssSelector, - elementTag: clickEvent.elementTag, - elementText: clickEvent.elementText, - screenshot: clickEvent.screenshot, + timestamp: click.timestamp, + tabId: click.tabId, + url: click.url, + frameUrl: click.frameUrl, + xpath: click.xpath, + cssSelector: click.cssSelector, + elementTag: click.elementTag, + elementText: click.elementText, + screenshot: click.screenshot, }; steps.push(step); } else { - console.warn("Skipping incomplete CUSTOM_CLICK_EVENT:", clickEvent); + console.warn("Skipping incomplete CUSTOM_CLICK_EVENT", click); } break; } - case "CUSTOM_INPUT_EVENT": { const inputEvent = event as StoredCustomInputEvent; - if ( - inputEvent.url && - // inputEvent.frameUrl && // frameUrl might be null/undefined in some cases, let's allow merging if only one is present or both match - inputEvent.xpath && - inputEvent.elementTag - ) { + if (inputEvent.url && inputEvent.xpath && inputEvent.elementTag) { + const key = `${inputEvent.tabId}|${inputEvent.xpath}`; + const prior = lastInputPerKey[key]; + const nowTs = inputEvent.timestamp; + const isEmpty = (inputEvent as any).value === ""; + if (isEmpty && prior && prior.value === "" && nowTs - prior.ts < 5000) { + // collapse rapid-fire repeated empties + 
steps[prior.idx].timestamp = nowTs; + break; + } const lastStep = steps.length > 0 ? steps[steps.length - 1] : null; - - // Check if the last step was a mergeable input event if ( lastStep && lastStep.type === "input" && lastStep.tabId === inputEvent.tabId && lastStep.url === inputEvent.url && - lastStep.frameUrl === inputEvent.frameUrl && // Ensure frameUrls match if both exist + lastStep.frameUrl === inputEvent.frameUrl && lastStep.xpath === inputEvent.xpath && lastStep.cssSelector === inputEvent.cssSelector && lastStep.elementTag === inputEvent.elementTag ) { - // Update the last input step (lastStep as InputStep).value = inputEvent.value; - lastStep.timestamp = inputEvent.timestamp; // Update to latest timestamp - (lastStep as InputStep).screenshot = inputEvent.screenshot; // Update to latest screenshot + lastStep.timestamp = inputEvent.timestamp; + (lastStep as InputStep).screenshot = inputEvent.screenshot; + lastInputPerKey[key] = { idx: steps.length - 1, ts: nowTs, value: (inputEvent as any).value }; } else { - // Add a new input step const newStep: InputStep = { type: "input", timestamp: inputEvent.timestamp, @@ -285,24 +341,22 @@ export default defineBackground(() => { screenshot: inputEvent.screenshot, }; steps.push(newStep); + lastInputPerKey[key] = { idx: steps.length - 1, ts: nowTs, value: (inputEvent as any).value }; } } else { - console.warn("Skipping incomplete CUSTOM_INPUT_EVENT:", inputEvent); + console.warn("Skipping incomplete CUSTOM_INPUT_EVENT", inputEvent); } break; } - case "CUSTOM_KEY_EVENT": { const keyEvent = event as StoredCustomKeyEvent; - // Key press might not always have a target element (xpath, etc.) 
- // but needs at least url and key if (keyEvent.url && keyEvent.key) { const step: KeyPressStep = { type: "key_press", timestamp: keyEvent.timestamp, tabId: keyEvent.tabId, url: keyEvent.url, - frameUrl: keyEvent.frameUrl, // Can be missing + frameUrl: keyEvent.frameUrl, key: keyEvent.key, xpath: keyEvent.xpath, cssSelector: keyEvent.cssSelector, @@ -311,77 +365,56 @@ export default defineBackground(() => { }; steps.push(step); } else { - console.warn("Skipping incomplete CUSTOM_KEY_EVENT:", keyEvent); + console.warn("Skipping incomplete CUSTOM_KEY_EVENT", keyEvent); } break; } - case "RRWEB_EVENT": { - // We only care about scroll events from rrweb for now const rrEvent = event as StoredRrwebEvent; - if ( - rrEvent.type === EventType.IncrementalSnapshot && - rrEvent.data.source === IncrementalSource.Scroll - ) { - const scrollData = rrEvent.data as { - id: number; - x: number; - y: number; - }; // Type assertion for clarity - const currentTabInfo = tabInfo[rrEvent.tabId]; // Get associated tab info for URL - - // Check if the last step added was a mergeable scroll event - const lastStep = steps.length > 0 ? steps[steps.length - 1] : null; if ( - lastStep && - lastStep.type === "scroll" && - lastStep.tabId === rrEvent.tabId && - (lastStep as ScrollStep).targetId === scrollData.id + rrEvent.type === EventType.IncrementalSnapshot && + rrEvent.data.source === IncrementalSource.Scroll ) { - // Update the last scroll step - (lastStep as ScrollStep).scrollX = scrollData.x; - (lastStep as ScrollStep).scrollY = scrollData.y; - lastStep.timestamp = rrEvent.timestamp; // Update to latest timestamp - // URL should already be set from the first event in the sequence - } else { - // Add a new scroll step - const newStep: ScrollStep = { - type: "scroll", + const scrollData = rrEvent.data as { id: number; x: number; y: number }; + const currentTabInfo = tabInfo[rrEvent.tabId]; + const lastStep = steps.length > 0 ? 
steps[steps.length - 1] : null; + if ( + lastStep && + lastStep.type === "scroll" && + lastStep.tabId === rrEvent.tabId && + (lastStep as ScrollStep).targetId === scrollData.id + ) { + (lastStep as ScrollStep).scrollX = scrollData.x; + (lastStep as ScrollStep).scrollY = scrollData.y; + lastStep.timestamp = rrEvent.timestamp; + } else { + const scrollStep: ScrollStep = { + type: "scroll", + timestamp: rrEvent.timestamp, + tabId: rrEvent.tabId, + targetId: scrollData.id, + scrollX: scrollData.x, + scrollY: scrollData.y, + url: currentTabInfo?.url, + }; + steps.push(scrollStep); + } + } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) { + const metaData = rrEvent.data as { href: string }; + const nav: NavigationStep = { + type: "navigation", timestamp: rrEvent.timestamp, tabId: rrEvent.tabId, - targetId: scrollData.id, - scrollX: scrollData.x, - scrollY: scrollData.y, - url: currentTabInfo?.url, // Add URL if available + url: metaData.href, }; - steps.push(newStep); + steps.push(nav); } - } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) { - // Also handle rrweb meta events as navigation - const metaData = rrEvent.data as { href: string }; - const step: NavigationStep = { - type: "navigation", - timestamp: rrEvent.timestamp, - tabId: rrEvent.tabId, - url: metaData.href, - }; - steps.push(step); - } break; } - - // Add cases for other StoredEvent types to Step types if needed - // e.g., CUSTOM_SELECT_EVENT -> SelectStep - // e.g., CUSTOM_TAB_CREATED -> TabCreatedStep - // RRWEB_EVENT type 4 (Meta) or 3 (FullSnapshot) could potentially map to NavigationStep if needed. - default: - // Ignore other event types for now - // console.log("Ignoring event type:", event.messageType); break; } } - return steps; } @@ -396,6 +429,8 @@ export default defineBackground(() => { "CUSTOM_INPUT_EVENT", "CUSTOM_SELECT_EVENT", "CUSTOM_KEY_EVENT", + // Synthetic event we will emit from content script just before an expected new tab open. 
+ "PREPARE_NEW_TAB", ]; if ( message.type === "RRWEB_EVENT" || @@ -412,6 +447,13 @@ export default defineBackground(() => { const tabId = sender.tab.id; const isCustomEvent = customEventTypes.includes(message.type); + // Record intent for new tab opening to correlate with onCreated event. + if (message.type === "PREPARE_NEW_TAB") { + recentNewTabIntents[sender.tab.id] = Date.now(); + // We do not store this as a workflow step; it's only heuristic metadata. + return false; + } + // Function to store the event const storeEvent = (eventPayload: any, screenshotDataUrl?: string) => { if (!sessionLogs[tabId]) { diff --git a/extension/src/entrypoints/content.ts b/extension/src/entrypoints/content.ts index 8f7f233f..a1a60231 100644 --- a/extension/src/entrypoints/content.ts +++ b/extension/src/entrypoints/content.ts @@ -240,6 +240,25 @@ function handleCustomClick(event: MouseEvent) { const targetElement = event.target as HTMLElement; if (!targetElement) return; + try { + // Detect if this click is likely to open a new tab so background can correlate upcoming tab creation. + // Heuristics: modifier key (Ctrl/Cmd / middle button) OR anchor with target=_blank / rel noopener. 
+ const isMiddle = event.button === 1; + const isModifier = event.metaKey || event.ctrlKey; // Cmd (mac) or Ctrl (win/linux) + let anchorOpensNew = false; + if (targetElement instanceof HTMLAnchorElement) { + const a = targetElement as HTMLAnchorElement; + anchorOpensNew = + (a.target && a.target.toLowerCase() === "_blank") || + a.rel.split(/\s+/).some((r) => r.toLowerCase() === "noopener" || r.toLowerCase() === "noreferrer"); + } + if (isMiddle || isModifier || anchorOpensNew) { + chrome.runtime.sendMessage({ type: "PREPARE_NEW_TAB", payload: { reason: "potential_new_tab_click" } }); + } + } catch (e) { + console.warn("Failed to evaluate new tab heuristic", e); + } + try { const xpath = getXPath(targetElement); const clickData = { @@ -263,12 +282,17 @@ function handleCustomClick(event: MouseEvent) { // --- End Custom Click Handler --- // --- Custom Input Handler --- +// Maintain last recorded value & timestamp per element (keyed by xpath) to suppress noisy repeats +const lastInputRecord: Record = {}; function handleInput(event: Event) { if (!isRecordingActive) return; const targetElement = event.target as HTMLInputElement | HTMLTextAreaElement; if (!targetElement || !("value" in targetElement)) return; const isPassword = targetElement.type === "password"; + // Ignore programmatic (non user-trusted) input events – these often cause massive duplication + if (!(event as InputEvent).isTrusted) return; + try { const xpath = getXPath(targetElement); const inputData = { @@ -280,6 +304,26 @@ function handleInput(event: Event) { elementTag: targetElement.tagName, value: isPassword ? 
"********" : targetElement.value, }; + + // Dedupe rule 1: If value unchanged for this element and within debounce window, skip + const DEBOUNCE_MS_INPUT = 1500; + const prev = lastInputRecord[xpath]; + if (prev && prev.value === inputData.value && inputData.timestamp - prev.ts < DEBOUNCE_MS_INPUT) { + return; // Suppress noisy duplicate + } + + // Dedupe rule 2: If value is empty string and we already recorded empty in last 5s, suppress further empties + if ( + inputData.value === "" && + prev && + prev.value === "" && + inputData.timestamp - prev.ts < 5000 + ) { + return; + } + + // Store/update last record metadata + lastInputRecord[xpath] = { value: inputData.value, ts: inputData.timestamp }; console.log("Sending CUSTOM_INPUT_EVENT:", inputData); chrome.runtime.sendMessage({ type: "CUSTOM_INPUT_EVENT", From c9d6dec933e45cb2a5dec4330cfaa9379b48f601 Mon Sep 17 00:00:00 2001 From: Diveyam-Mishra Date: Mon, 11 Aug 2025 03:11:20 +0530 Subject: [PATCH 02/13] Does no t record duplicate steps just because of different mouse positions --- extension/src/entrypoints/background.ts | 155 ++++++++++++++++++------ 1 file changed, 119 insertions(+), 36 deletions(-) diff --git a/extension/src/entrypoints/background.ts b/extension/src/entrypoints/background.ts index 50c05309..ad039b04 100644 --- a/extension/src/entrypoints/background.ts +++ b/extension/src/entrypoints/background.ts @@ -79,13 +79,72 @@ export default defineBackground(() => { // Function to broadcast workflow data updates to the console bus async function broadcastWorkflowDataUpdate(): Promise { // console.log("[DEBUG] broadcastWorkflowDataUpdate: Entered function"); // Optional: Keep for debugging - const allSteps: Step[] = Object.keys(sessionLogs) + const rawSteps: Step[] = Object.keys(sessionLogs) .flatMap((tabIdStr) => { const tabId = parseInt(tabIdStr, 10); return convertStoredEventsToSteps(sessionLogs[tabId] || []); }) .sort((a, b) => a.timestamp - b.timestamp); // Sort chronologically + // Post-process to 
collapse consecutive duplicates that only differ by timestamp (e.g. repeated identical navigations) + const allSteps: Step[] = []; + for (const step of rawSteps) { + const last = allSteps.length ? allSteps[allSteps.length - 1] : null; + if (!last) { + allSteps.push(step); + continue; + } + let isDuplicate = false; + if (last.type === step.type) { + switch (step.type) { + case 'navigation': + isDuplicate = (last as NavigationStep).url === (step as NavigationStep).url && last.tabId === step.tabId; + break; + case 'input': + isDuplicate = + last.tabId === step.tabId && + (last as any).url === (step as any).url && + (last as any).frameUrl === (step as any).frameUrl && + (last as any).xpath === (step as any).xpath && + (last as any).elementTag === (step as any).elementTag && + (last as any).value === (step as any).value; + break; + case 'click': + isDuplicate = + last.tabId === step.tabId && + (last as any).url === (step as any).url && + (last as any).frameUrl === (step as any).frameUrl && + (last as any).xpath === (step as any).xpath && + (last as any).elementTag === (step as any).elementTag && + (last as any).elementText === (step as any).elementText; + break; + case 'scroll': + isDuplicate = + last.tabId === step.tabId && + (last as any).targetId === (step as any).targetId && + (last as any).scrollX === (step as any).scrollX && + (last as any).scrollY === (step as any).scrollY; + break; + case 'key_press': + isDuplicate = + last.tabId === step.tabId && + (last as any).url === (step as any).url && + (last as any).key === (step as any).key && + (last as any).xpath === (step as any).xpath; + break; + } + } + if (isDuplicate) { + // Update timestamp (and screenshot if present) to most recent but don't add new step + last.timestamp = step.timestamp; + if ((step as any).screenshot) { + (last as any).screenshot = (step as any).screenshot; + } + } else { + allSteps.push(step); + } + } + // Create the workflowData object *after* sorting steps, but hash only steps const 
workflowData: Workflow = { name: "Recorded Workflow", @@ -344,19 +403,22 @@ export default defineBackground(() => { lastInputPerKey[key] = { idx: steps.length - 1, ts: nowTs, value: (inputEvent as any).value }; } } else { - console.warn("Skipping incomplete CUSTOM_INPUT_EVENT", inputEvent); + console.warn("Skipping incomplete CUSTOM_INPUT_EVENT:", inputEvent); } break; } + case "CUSTOM_KEY_EVENT": { const keyEvent = event as StoredCustomKeyEvent; + // Key press might not always have a target element (xpath, etc.) + // but needs at least url and key if (keyEvent.url && keyEvent.key) { const step: KeyPressStep = { type: "key_press", timestamp: keyEvent.timestamp, tabId: keyEvent.tabId, url: keyEvent.url, - frameUrl: keyEvent.frameUrl, + frameUrl: keyEvent.frameUrl, // Can be missing key: keyEvent.key, xpath: keyEvent.xpath, cssSelector: keyEvent.cssSelector, @@ -365,56 +427,77 @@ export default defineBackground(() => { }; steps.push(step); } else { - console.warn("Skipping incomplete CUSTOM_KEY_EVENT", keyEvent); + console.warn("Skipping incomplete CUSTOM_KEY_EVENT:", keyEvent); } break; } + case "RRWEB_EVENT": { + // We only care about scroll events from rrweb for now const rrEvent = event as StoredRrwebEvent; + if ( + rrEvent.type === EventType.IncrementalSnapshot && + rrEvent.data.source === IncrementalSource.Scroll + ) { + const scrollData = rrEvent.data as { + id: number; + x: number; + y: number; + }; // Type assertion for clarity + const currentTabInfo = tabInfo[rrEvent.tabId]; // Get associated tab info for URL + + // Check if the last step added was a mergeable scroll event + const lastStep = steps.length > 0 ? 
steps[steps.length - 1] : null; if ( - rrEvent.type === EventType.IncrementalSnapshot && - rrEvent.data.source === IncrementalSource.Scroll + lastStep && + lastStep.type === "scroll" && + lastStep.tabId === rrEvent.tabId && + (lastStep as ScrollStep).targetId === scrollData.id ) { - const scrollData = rrEvent.data as { id: number; x: number; y: number }; - const currentTabInfo = tabInfo[rrEvent.tabId]; - const lastStep = steps.length > 0 ? steps[steps.length - 1] : null; - if ( - lastStep && - lastStep.type === "scroll" && - lastStep.tabId === rrEvent.tabId && - (lastStep as ScrollStep).targetId === scrollData.id - ) { - (lastStep as ScrollStep).scrollX = scrollData.x; - (lastStep as ScrollStep).scrollY = scrollData.y; - lastStep.timestamp = rrEvent.timestamp; - } else { - const scrollStep: ScrollStep = { - type: "scroll", - timestamp: rrEvent.timestamp, - tabId: rrEvent.tabId, - targetId: scrollData.id, - scrollX: scrollData.x, - scrollY: scrollData.y, - url: currentTabInfo?.url, - }; - steps.push(scrollStep); - } - } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) { - const metaData = rrEvent.data as { href: string }; - const nav: NavigationStep = { - type: "navigation", + // Update the last scroll step + (lastStep as ScrollStep).scrollX = scrollData.x; + (lastStep as ScrollStep).scrollY = scrollData.y; + lastStep.timestamp = rrEvent.timestamp; // Update to latest timestamp + // URL should already be set from the first event in the sequence + } else { + // Add a new scroll step + const newStep: ScrollStep = { + type: "scroll", timestamp: rrEvent.timestamp, tabId: rrEvent.tabId, - url: metaData.href, + targetId: scrollData.id, + scrollX: scrollData.x, + scrollY: scrollData.y, + url: currentTabInfo?.url, // Add URL if available }; - steps.push(nav); + steps.push(newStep); } + } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) { + // Also handle rrweb meta events as navigation + const metaData = rrEvent.data as { href: string }; + 
const step: NavigationStep = { + type: "navigation", + timestamp: rrEvent.timestamp, + tabId: rrEvent.tabId, + url: metaData.href, + }; + steps.push(step); + } break; } + + // Add cases for other StoredEvent types to Step types if needed + // e.g., CUSTOM_SELECT_EVENT -> SelectStep + // e.g., CUSTOM_TAB_CREATED -> TabCreatedStep + // RRWEB_EVENT type 4 (Meta) or 3 (FullSnapshot) could potentially map to NavigationStep if needed. + default: + // Ignore other event types for now + // console.log("Ignoring event type:", event.messageType); break; } } + return steps; } From cd9ebc7d7538ab9e8e7ad682c5be116e8c56a11e Mon Sep 17 00:00:00 2001 From: Diveyam-Mishra Date: Mon, 11 Aug 2025 04:05:48 +0530 Subject: [PATCH 03/13] works but ui needs to change --- extension/src/entrypoints/background.ts | 79 ++++++++++++++++ extension/src/entrypoints/content.ts | 94 ++++++++++++------- extension/src/entrypoints/options.html | 80 ++++++++++++++++ .../sidepanel/components/recording-view.tsx | 40 +++++++- extension/src/lib/types.ts | 5 + extension/src/lib/workflow-types.ts | 5 + extension/wxt.config.ts | 12 ++- workflows/workflow_use/recorder/service.py | 27 +++++- 8 files changed, 298 insertions(+), 44 deletions(-) create mode 100644 extension/src/entrypoints/options.html diff --git a/extension/src/entrypoints/background.ts b/extension/src/entrypoints/background.ts index ad039b04..6e1ae033 100644 --- a/extension/src/entrypoints/background.ts +++ b/extension/src/entrypoints/background.ts @@ -39,6 +39,24 @@ export default defineBackground(() => { // Content scripts will send a PREPARE_NEW_TAB signal; we keep timestamp to correlate // shortly following chrome.tabs.onCreated events so we can mark those tabs as user initiated. 
const recentNewTabIntents: { [openerTabId: number]: number } = {}; + // Record iframe URLs that the user actually interacted with (via custom events) per tab + const interactedFrameUrls: Record> = {}; + // Additionally track last interaction time per frame for time-window gating + const interactedFrameTimes: Record> = {}; + // Hostname patterns for iframe navigation noise we want to suppress + const BLOCKED_IFRAME_HOST_PATTERNS: RegExp[] = [ + /doubleclick\.net$/i, + /googlesyndication\.com$/i, + /googleadservices\.com$/i, + /amazon-adsystem\.com$/i, + /recaptcha\.google\.com$/i, + /recaptcha\.net$/i, + /googletagmanager\.com$/i, + /indexww\.com$/i, + /adtrafficquality\.google$/i, + /2mdn\.net$/i, + /gstaticadssl\.googleapis\.com$/i, + ]; // Heuristic window (ms) within which a created tab following a user intent is considered relevant. const NEW_TAB_INTENT_WINDOW_MS = 4000; @@ -301,6 +319,25 @@ export default defineBackground(() => { // --- Conversion Function --- + const DEFAULT_SETTINGS = { + enableIframes: true as boolean, + iframeWindow: 3000 as number, + blocklist: [ + 'doubleclick.net','googlesyndication.com','googleadservices.com', + 'amazon-adsystem.com','2mdn.net','recaptcha.google.com','recaptcha.net', + 'googletagmanager.com','indexww.com','adtrafficquality.google' + ] as string[], + allowlist: [] as string[], + }; + let settings: { enableIframes: boolean; iframeWindow: number; blocklist: string[]; allowlist: string[] } = { ...DEFAULT_SETTINGS }; + chrome.storage.sync.get(DEFAULT_SETTINGS, (s: any) => { settings = { ...settings, ...s }; }); + chrome.storage.onChanged.addListener((changes, area) => { + if (area !== 'sync') return; + const next = { ...settings } as any; + for (const k of Object.keys(changes)) (next as any)[k] = (changes as any)[k].newValue; + settings = next; + }); + function convertStoredEventsToSteps(events: StoredEvent[]): Step[] { const steps: Step[] = []; const lastNavigationIndexByTab: Record = {}; @@ -347,6 +384,7 @@ export default 
defineBackground(() => { tabId: click.tabId, url: click.url, frameUrl: click.frameUrl, + frameIdPath: (click as any).frameIdPath, xpath: click.xpath, cssSelector: click.cssSelector, elementTag: click.elementTag, @@ -393,6 +431,7 @@ export default defineBackground(() => { tabId: inputEvent.tabId, url: inputEvent.url, frameUrl: inputEvent.frameUrl, + frameIdPath: (inputEvent as any).frameIdPath, xpath: inputEvent.xpath, cssSelector: inputEvent.cssSelector, elementTag: inputEvent.elementTag, @@ -419,6 +458,7 @@ export default defineBackground(() => { tabId: keyEvent.tabId, url: keyEvent.url, frameUrl: keyEvent.frameUrl, // Can be missing + frameIdPath: (keyEvent as any).frameIdPath, key: keyEvent.key, xpath: keyEvent.xpath, cssSelector: keyEvent.cssSelector, @@ -475,11 +515,43 @@ export default defineBackground(() => { } else if (rrEvent.type === EventType.Meta && rrEvent.data?.href) { // Also handle rrweb meta events as navigation const metaData = rrEvent.data as { href: string }; + const href = metaData.href; + // Drop about:blank always + if (href === 'about:blank') { + break; + } + try { + const urlObj = new URL(href); + const host = urlObj.hostname; + // Allowlist overrides blocklist + const inAllow = settings.allowlist.some(d => host.endsWith(d)); + const inBlock = settings.blocklist.some(d => host.endsWith(d)); + if (!inAllow && inBlock) { + break; + } + if (!settings.enableIframes && !(rrEvent as any).isTopFrame) { + break; // user disabled iframe recording + } + // If top frame, allow + if ((rrEvent as any).isTopFrame) { + // allowed + } else { + const fUrl = (rrEvent as any).frameUrl as string | undefined; + if (!fUrl) break; + const times = interactedFrameTimes[rrEvent.tabId] || {}; + const lastTs = times[fUrl]; + if (!lastTs) break; + if (Date.now() - lastTs > settings.iframeWindow) break; + } + } catch { + break; + } const step: NavigationStep = { type: "navigation", timestamp: rrEvent.timestamp, tabId: rrEvent.tabId, url: metaData.href, + // frameIdPath 
could be attached if needed }; steps.push(step); } @@ -559,6 +631,13 @@ export default defineBackground(() => { screenshot: screenshotDataUrl, }; sessionLogs[tabId].push(eventWithMeta); + // Mark frame as interacted so subsequent iframe meta navigations can be allowed + if (message.type.startsWith("CUSTOM_") && eventPayload.frameUrl) { + if (!interactedFrameUrls[tabId]) interactedFrameUrls[tabId] = new Set(); + interactedFrameUrls[tabId].add(eventPayload.frameUrl); + if (!interactedFrameTimes[tabId]) interactedFrameTimes[tabId] = {}; + interactedFrameTimes[tabId][eventPayload.frameUrl] = Date.now(); + } broadcastWorkflowDataUpdate(); // Call is async, will not block // console.log(`Stored ${message.type} from tab ${tabId}`); }; diff --git a/extension/src/entrypoints/content.ts b/extension/src/entrypoints/content.ts index a1a60231..a646562f 100644 --- a/extension/src/entrypoints/content.ts +++ b/extension/src/entrypoints/content.ts @@ -122,6 +122,16 @@ function startRecorder() { emit(event) { if (!isRecordingActive) return; + const frameUrl = window.location.href; + const isTopFrame = window.self === window.top; + const frameIdPath = (() => { + try { + let win: any = window; const parts: number[] = []; + while (win !== win.parent) { const parent = win.parent; let idx=0; for (let i=0;i10) break; } + return parts.length ? 
parts.join('.') : '0'; + } catch { return '0'; } + })(); + // Handle scroll events with debouncing and direction detection if ( event.type === EventType.IncrementalSnapshot && @@ -157,7 +167,10 @@ function startRecorder() { type: "RRWEB_EVENT", payload: { ...event, - data: roundedScrollData, // Use rounded coordinates + data: roundedScrollData, + frameUrl, + frameIdPath, + isTopFrame, }, }); lastDirection = currentDirection; @@ -178,15 +191,18 @@ function startRecorder() { type: "RRWEB_EVENT", payload: { ...event, - data: roundedScrollData, // Use rounded coordinates + data: roundedScrollData, + frameUrl, + frameIdPath, + isTopFrame, }, }); scrollTimeout = null; lastDirection = null; // Reset direction for next scroll }, DEBOUNCE_MS); } else { - // Pass through non-scroll events unchanged - chrome.runtime.sendMessage({ type: "RRWEB_EVENT", payload: event }); + // Pass through non-scroll events unchanged, but include frame context for filtering in background + chrome.runtime.sendMessage({ type: "RRWEB_EVENT", payload: { ...event, frameUrl, frameIdPath, isTopFrame } }); } }, maskInputOptions: { @@ -239,45 +255,38 @@ function handleCustomClick(event: MouseEvent) { if (!isRecordingActive) return; const targetElement = event.target as HTMLElement; if (!targetElement) return; - - try { - // Detect if this click is likely to open a new tab so background can correlate upcoming tab creation. - // Heuristics: modifier key (Ctrl/Cmd / middle button) OR anchor with target=_blank / rel noopener. 
- const isMiddle = event.button === 1; - const isModifier = event.metaKey || event.ctrlKey; // Cmd (mac) or Ctrl (win/linux) - let anchorOpensNew = false; - if (targetElement instanceof HTMLAnchorElement) { - const a = targetElement as HTMLAnchorElement; - anchorOpensNew = - (a.target && a.target.toLowerCase() === "_blank") || - a.rel.split(/\s+/).some((r) => r.toLowerCase() === "noopener" || r.toLowerCase() === "noreferrer"); - } - if (isMiddle || isModifier || anchorOpensNew) { - chrome.runtime.sendMessage({ type: "PREPARE_NEW_TAB", payload: { reason: "potential_new_tab_click" } }); - } - } catch (e) { - console.warn("Failed to evaluate new tab heuristic", e); - } - + // Determine a frame identifier (best-effort). Top frame = 0, nested frames build path. + const frameIdPath = (() => { + try { + let win: any = window; + const parts: number[] = []; + while (win !== win.parent) { + const parent = win.parent; + let index = 0; + for (let i = 0; i < parent.frames.length; i++) { + if (parent.frames[i] === win) { index = i; break; } + } + parts.unshift(index); + win = parent; + if (parts.length > 10) break; // safety + } + return parts.length ? 
parts.join('.') : '0'; + } catch { return '0'; } + })(); try { const xpath = getXPath(targetElement); const clickData = { timestamp: Date.now(), - url: document.location.href, // Use document.location for main page URL - frameUrl: window.location.href, // URL of the frame where the event occurred - xpath: xpath, + url: document.location.href, + frameUrl: window.location.href, + frameIdPath, + xpath, cssSelector: getEnhancedCSSSelector(targetElement, xpath), elementTag: targetElement.tagName, elementText: targetElement.textContent?.trim().slice(0, 200) || "", }; - console.log("Sending CUSTOM_CLICK_EVENT:", clickData); - chrome.runtime.sendMessage({ - type: "CUSTOM_CLICK_EVENT", - payload: clickData, - }); - } catch (error) { - console.error("Error capturing click data:", error); - } + chrome.runtime.sendMessage({ type: "CUSTOM_CLICK_EVENT", payload: clickData }); + } catch (error) { console.error("Error capturing click data:", error); } } // --- End Custom Click Handler --- @@ -293,12 +302,20 @@ function handleInput(event: Event) { // Ignore programmatic (non user-trusted) input events – these often cause massive duplication if (!(event as InputEvent).isTrusted) return; + const frameIdPath = (() => { + try { + let win: any = window; const parts: number[] = []; + while (win !== win.parent) { const parent = win.parent; let idx=0; for (let i=0;i10) break; } + return parts.length ? 
parts.join('.') : '0'; + } catch { return '0'; } + })(); try { const xpath = getXPath(targetElement); const inputData = { timestamp: Date.now(), url: document.location.href, frameUrl: window.location.href, + frameIdPath, xpath: xpath, cssSelector: getEnhancedCSSSelector(targetElement, xpath), elementTag: targetElement.tagName, @@ -341,6 +358,7 @@ function handleSelectChange(event: Event) { const targetElement = event.target as HTMLSelectElement; // Ensure it's a select element if (!targetElement || targetElement.tagName !== "SELECT") return; + const frameIdPath = (() => { try { let win:any=window; const parts:number[]=[]; while(win!==win.parent){const parent=win.parent; let idx=0; for(let i=0;i10) break;} return parts.length?parts.join('.'):'0'; } catch { return '0'; } })(); try { const xpath = getXPath(targetElement); @@ -349,6 +367,7 @@ function handleSelectChange(event: Event) { timestamp: Date.now(), url: document.location.href, frameUrl: window.location.href, + frameIdPath, xpath: xpath, cssSelector: getEnhancedCSSSelector(targetElement, xpath), elementTag: targetElement.tagName, @@ -421,11 +440,13 @@ function handleKeydown(event: KeyboardEvent) { } } + const frameIdPath = (() => { try { let win:any=window; const parts:number[]=[]; while(win!==win.parent){const parent=win.parent; let idx=0; for(let i=0;i10) break;} return parts.length?parts.join('.'):'0'; } catch { return '0'; } })(); try { const keyData = { timestamp: Date.now(), url: document.location.href, frameUrl: window.location.href, + frameIdPath, key: keyToLog, // The key or combination pressed xpath: xpath, // XPath of the element in focus (if any) cssSelector: cssSelector, // CSS selector of the element in focus (if any) @@ -588,6 +609,9 @@ function handleBlur(event: FocusEvent) { export default defineContentScript({ matches: [""], + // Ensure injection into all frames (iframes) so we can capture interactions inside nested documents. 
+ allFrames: true, + matchAboutBlank: true, main(ctx) { // Listener for status updates from the background script chrome.runtime.onMessage.addListener((message, sender, sendResponse) => { diff --git a/extension/src/entrypoints/options.html b/extension/src/entrypoints/options.html new file mode 100644 index 00000000..dfb7bf49 --- /dev/null +++ b/extension/src/entrypoints/options.html @@ -0,0 +1,80 @@ + + + + + Workflow Use - Options + + + +

Recording Settings

+
+
+ +
When disabled, iframe-originated navigation/meta events are ignored.
+
+
+ + +
Time after a user interaction in an iframe during which rrweb meta navigations are allowed.
+
+
+ + +
+
+ + +
+ + +
+ + + + diff --git a/extension/src/entrypoints/sidepanel/components/recording-view.tsx b/extension/src/entrypoints/sidepanel/components/recording-view.tsx index 0fa3456a..0aa09f33 100644 --- a/extension/src/entrypoints/sidepanel/components/recording-view.tsx +++ b/extension/src/entrypoints/sidepanel/components/recording-view.tsx @@ -1,12 +1,25 @@ import React from "react"; import { useWorkflow } from "../context/workflow-provider"; import { Button } from "@/components/ui/button"; -import { EventViewer } from "./event-viewer"; // Import EventViewer +import { EventViewer } from "./event-viewer"; export const RecordingView: React.FC = () => { const { stopRecording, workflow } = useWorkflow(); const stepCount = workflow?.steps?.length || 0; + const openOptions = () => chrome.runtime.openOptionsPage(); + const [enableIframes, setEnableIframes] = React.useState(true); + React.useEffect(() => { + chrome.storage.sync.get({ enableIframes: true }, (s) => + setEnableIframes(!!s.enableIframes) + ); + }, []); + const toggleIframes = async () => { + const next = !enableIframes; + setEnableIframes(next); + await chrome.storage.sync.set({ enableIframes: next }); + }; + return (
@@ -19,12 +32,29 @@ export const RecordingView: React.FC = () => { Recording ({stepCount} steps)
- +
+ + + +
- {/* EventViewer will now take full available space within this div */}
diff --git a/extension/src/lib/types.ts b/extension/src/lib/types.ts index 601d3aac..8cc94232 100644 --- a/extension/src/lib/types.ts +++ b/extension/src/lib/types.ts @@ -2,6 +2,7 @@ export interface StoredCustomClickEvent { timestamp: number; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; elementTag: string; @@ -15,6 +16,7 @@ export interface StoredCustomInputEvent { timestamp: number; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; elementTag: string; @@ -28,6 +30,7 @@ export interface StoredCustomSelectEvent { timestamp: number; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; elementTag: string; @@ -42,6 +45,7 @@ export interface StoredCustomKeyEvent { timestamp: number; url: string; frameUrl: string; + frameIdPath?: string; key: string; xpath?: string; // XPath of focused element cssSelector?: string; @@ -73,6 +77,7 @@ export interface StoredRrwebEvent { data: any; timestamp: number; tabId: number; + frameUrl?: string; // URL of the frame where the rrweb event originated (when injected per-frame) messageType: "RRWEB_EVENT"; } diff --git a/extension/src/lib/workflow-types.ts b/extension/src/lib/workflow-types.ts index 57d4470f..35605bd6 100644 --- a/extension/src/lib/workflow-types.ts +++ b/extension/src/lib/workflow-types.ts @@ -27,12 +27,14 @@ export interface NavigationStep extends BaseStep { type: "navigation"; url: string; // Navigation implies a URL change screenshot?: string; // Optional in source + frameIdPath?: string; // Optional frame context } export interface ClickStep extends BaseStep { type: "click"; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; // Optional in source elementTag: string; @@ -44,6 +46,7 @@ export interface InputStep extends BaseStep { type: "input"; url: string; frameUrl: string; + frameIdPath?: string; xpath: string; cssSelector?: string; // Optional in 
source elementTag: string; @@ -55,6 +58,7 @@ export interface KeyPressStep extends BaseStep { type: "key_press"; url?: string; // Can be missing if key press happens without element focus? Source is optional. frameUrl?: string; // Might be missing + frameIdPath?: string; key: string; xpath?: string; // Optional in source cssSelector?: string; // Optional in source @@ -67,6 +71,7 @@ export interface ScrollStep extends BaseStep { targetId: number; // The rrweb ID of the element being scrolled scrollX: number; scrollY: number; + frameIdPath?: string; // Note: url might be missing if scroll happens on initial load before meta event? } diff --git a/extension/wxt.config.ts b/extension/wxt.config.ts index 9d7ae4ec..83bf64d8 100644 --- a/extension/wxt.config.ts +++ b/extension/wxt.config.ts @@ -13,9 +13,15 @@ export default defineConfig({ // WXT-specific overrides (optional) }), manifest: { - permissions: ["tabs", "sidePanel", ""], - host_permissions: ["http://127.0.0.1/*"], - // options_page: "options.html", + permissions: ["tabs", "sidePanel", "storage", ""], + // Broaden host permissions so content script can inject into iframes on external sites. + // Note: in permissions allows some access, but host_permissions explicitly grants injection rights. 
+ host_permissions: [ + "http://127.0.0.1/*", + "https://*/*", + "http://*/*" + ], + options_page: "options.html", // action: { // default_popup: "popup.html", // }, diff --git a/workflows/workflow_use/recorder/service.py b/workflows/workflow_use/recorder/service.py index fb154ba9..5de21556 100644 --- a/workflows/workflow_use/recorder/service.py +++ b/workflows/workflow_use/recorder/service.py @@ -85,7 +85,32 @@ async def _capture_and_signal_final_workflow(self, trigger_reason: str): async with self.final_workflow_processed_lock: if not self.final_workflow_processed_flag and self.last_workflow_update_event: print(f'[Service] Capturing final workflow (Trigger: {trigger_reason}).') - self.final_workflow_output = self.last_workflow_update_event.payload + wf = self.last_workflow_update_event.payload + # Backend safety filter: drop about:blank and obvious ad/analytics iframe navigations + try: + clean_steps = [] + for s in wf.steps: + st = getattr(s, 'type', None) or (s.get('type') if isinstance(s, dict) else None) + url = getattr(s, 'url', None) or (s.get('url') if isinstance(s, dict) else None) + if st == 'navigation': + if not url or url == 'about:blank': + continue + from urllib.parse import urlparse + host = urlparse(url).hostname or '' + blocked = any( + pat in host for pat in ( + 'doubleclick.net', 'googlesyndication.com', 'googleadservices.com', + 'amazon-adsystem.com', '2mdn.net', 'recaptcha.google.com', 'recaptcha.net', + 'googletagmanager.com', 'indexww.com', 'adtrafficquality.google' + ) + ) + if blocked: + continue + clean_steps.append(s) + wf.steps = clean_steps + except Exception as e: + print(f'[Service] Backend filter failed: {e}') + self.final_workflow_output = wf self.final_workflow_processed_flag = True processed_this_call = True From cb846efe170a7c4da47a9d9051506b827de1442d Mon Sep 17 00:00:00 2001 From: Diveyam-Mishra Date: Mon, 11 Aug 2025 04:23:07 +0530 Subject: [PATCH 04/13] final recording steps completed frontend side --- 
extension/src/entrypoints/background.ts | 23 ++++++++++----- .../sidepanel/components/recording-view.tsx | 29 ------------------- 2 files changed, 15 insertions(+), 37 deletions(-) diff --git a/extension/src/entrypoints/background.ts b/extension/src/entrypoints/background.ts index 6e1ae033..bf8d49ea 100644 --- a/extension/src/entrypoints/background.ts +++ b/extension/src/entrypoints/background.ts @@ -136,13 +136,13 @@ export default defineBackground(() => { (last as any).elementTag === (step as any).elementTag && (last as any).elementText === (step as any).elementText; break; - case 'scroll': - isDuplicate = - last.tabId === step.tabId && - (last as any).targetId === (step as any).targetId && - (last as any).scrollX === (step as any).scrollX && - (last as any).scrollY === (step as any).scrollY; + case 'scroll': { + const sameXY = (last as any).scrollX === (step as any).scrollX && (last as any).scrollY === (step as any).scrollY; + const sameUrl = (last as any).url === (step as any).url; + const nearTime = Math.abs(step.timestamp - last.timestamp) < 200; + isDuplicate = last.tabId === step.tabId && sameXY && sameUrl && nearTime; break; + } case 'key_press': isDuplicate = last.tabId === step.tabId && @@ -485,14 +485,21 @@ export default defineBackground(() => { y: number; }; // Type assertion for clarity const currentTabInfo = tabInfo[rrEvent.tabId]; // Get associated tab info for URL - + // Drop internal chrome pages like chrome://newtab/ + if (currentTabInfo?.url?.startsWith('chrome://')) { + break; + } // Check if the last step added was a mergeable scroll event const lastStep = steps.length > 0 ? 
steps[steps.length - 1] : null; if ( lastStep && lastStep.type === "scroll" && lastStep.tabId === rrEvent.tabId && - (lastStep as ScrollStep).targetId === scrollData.id + // Treat same XY within a short time window as duplicate, regardless of targetId + (lastStep as ScrollStep).scrollX === scrollData.x && + (lastStep as ScrollStep).scrollY === scrollData.y && + Math.abs(rrEvent.timestamp - lastStep.timestamp) < 200 && + (lastStep as any).url === currentTabInfo?.url ) { // Update the last scroll step (lastStep as ScrollStep).scrollX = scrollData.x; diff --git a/extension/src/entrypoints/sidepanel/components/recording-view.tsx b/extension/src/entrypoints/sidepanel/components/recording-view.tsx index 0aa09f33..1aa8a0de 100644 --- a/extension/src/entrypoints/sidepanel/components/recording-view.tsx +++ b/extension/src/entrypoints/sidepanel/components/recording-view.tsx @@ -7,19 +7,6 @@ export const RecordingView: React.FC = () => { const { stopRecording, workflow } = useWorkflow(); const stepCount = workflow?.steps?.length || 0; - const openOptions = () => chrome.runtime.openOptionsPage(); - const [enableIframes, setEnableIframes] = React.useState(true); - React.useEffect(() => { - chrome.storage.sync.get({ enableIframes: true }, (s) => - setEnableIframes(!!s.enableIframes) - ); - }, []); - const toggleIframes = async () => { - const next = !enableIframes; - setEnableIframes(next); - await chrome.storage.sync.set({ enableIframes: next }); - }; - return (
@@ -33,22 +20,6 @@ export const RecordingView: React.FC = () => {
- - From 1856775978b152928dda3835ad049aba687cb43a Mon Sep 17 00:00:00 2001 From: Diveyam-Mishra Date: Mon, 11 Aug 2025 04:40:41 +0530 Subject: [PATCH 05/13] Added interaction with iframe from backend now needs tocontain the click in iframe itself --- workflows/workflow_use/controller/service.py | 43 +++++++++++++++++++- workflows/workflow_use/controller/views.py | 2 + workflows/workflow_use/workflow/service.py | 19 ++++++--- 3 files changed, 57 insertions(+), 7 deletions(-) diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py index 238fcbb5..f4c54d4d 100644 --- a/workflows/workflow_use/controller/service.py +++ b/workflows/workflow_use/controller/service.py @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) -DEFAULT_ACTION_TIMEOUT_MS = 1000 +DEFAULT_ACTION_TIMEOUT_MS = 2500 # List of default actions from browser_use.controller.service.Controller to disable # todo: come up with a better way to filter out the actions (filter IN the actions would be much nicer in this case) @@ -83,9 +83,48 @@ async def click(params: ClickElementDeterministicAction, browser_session: Browse page = await browser_session.get_current_page() original_selector = params.cssSelector + # If frameUrl or frameIdPath are provided, narrow the search to that frame + def _select_context(pg): + try: + from playwright.async_api import Page, Frame + ctx: Page | Frame = pg + if getattr(params, 'frameIdPath', None): + parts = [p for p in str(params.frameIdPath).split('.') if p != '0' and p != ''] + f = pg.main_frame + for seg in parts: + idx = int(seg) + if idx < len(f.child_frames): + f = f.child_frames[idx] + else: + return ctx + ctx = f + elif getattr(params, 'frameUrl', None): + from urllib.parse import urlparse + pf = urlparse(params.frameUrl) + for fr in pg.frames: + try: + ff = urlparse(fr.url) + # Match origin, and allow target frameUrl to be a prefix of full URL + if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and 
fr.url.startswith(params.frameUrl): + ctx = fr + break + except Exception: + continue + except Exception: + ctx = pg + return ctx + try: + # If the step declares a URL and it's different from current, navigate first (minimum action to reach expected DOM) + curr = (page.url or '').split('#')[0] + tgt = (getattr(params, 'url', None) or getattr(params, 'frameUrl', None) or '').split('#')[0] + if tgt and tgt.startswith('http') and curr != tgt: + await page.goto(tgt) + await page.wait_for_load_state() + + ctx = _select_context(page) locator, selector_used = await get_best_element_handle( - page, + ctx, params.cssSelector, params, timeout_ms=DEFAULT_ACTION_TIMEOUT_MS, diff --git a/workflows/workflow_use/controller/views.py b/workflows/workflow_use/controller/views.py index 8c61470a..ffb25b4a 100644 --- a/workflows/workflow_use/controller/views.py +++ b/workflows/workflow_use/controller/views.py @@ -23,6 +23,8 @@ class RecorderBase(StepMeta): elementTag: Optional[str] = None elementText: Optional[str] = None frameUrl: Optional[str] = None + frameIdPath: Optional[str] = None + url: Optional[str] = None screenshot: Optional[str] = None diff --git a/workflows/workflow_use/workflow/service.py b/workflows/workflow_use/workflow/service.py index aa7008c1..d21da2e8 100644 --- a/workflows/workflow_use/workflow/service.py +++ b/workflows/workflow_use/workflow/service.py @@ -144,11 +144,20 @@ def truncate_selector(selector: str) -> str: await self.browser._wait_for_stable_network() page = await self.browser.get_current_page() - logger.info(f'Waiting for element with selector: {truncate_selector(css_selector)}') - locator, selector_used = await get_best_element_handle( - page, css_selector, next_step_resolved, timeout_ms=WAIT_FOR_ELEMENT_TIMEOUT - ) - logger.info(f'Element with selector found: {truncate_selector(selector_used)}') + # If the next step declares a URL/frameUrl and it does not match the current page URL, + # skip waiting for its element on the current page (prevents 
false failures like step 7). + curr_url = (page.url or '').split('#')[0] + declared_next_url = (getattr(next_step_resolved, 'url', None) or getattr(next_step_resolved, 'frameUrl', None) or '').split('#')[0] + if declared_next_url and declared_next_url != curr_url: + logger.info( + f"Skipping pre-wait for next selector because declared next URL ({truncate_selector(declared_next_url)}) != current URL ({truncate_selector(curr_url)})" + ) + else: + logger.info(f'Waiting for element with selector: {truncate_selector(css_selector)}') + locator, selector_used = await get_best_element_handle( + page, css_selector, next_step_resolved, timeout_ms=WAIT_FOR_ELEMENT_TIMEOUT + ) + logger.info(f'Element with selector found: {truncate_selector(selector_used)}') except Exception as e: logger.error(f'Failed to wait for element with selector: {truncate_selector(css_selector)}. Error: {e}') raise Exception(f'Failed to wait for element. Selector: {css_selector}') from e From 2ccea941462751b0406981ee802c4bb0620ca526 Mon Sep 17 00:00:00 2001 From: Diveyam-Mishra Date: Thu, 14 Aug 2025 00:12:17 +0530 Subject: [PATCH 06/13] Backend Perfectly clicks on iframe using iterative approach --- workflows/workflow_use/controller/service.py | 96 +++++++++++++++----- 1 file changed, 71 insertions(+), 25 deletions(-) diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py index f4c54d4d..2465d8cc 100644 --- a/workflows/workflow_use/controller/service.py +++ b/workflows/workflow_use/controller/service.py @@ -88,50 +88,96 @@ def _select_context(pg): try: from playwright.async_api import Page, Frame ctx: Page | Frame = pg - if getattr(params, 'frameIdPath', None): - parts = [p for p in str(params.frameIdPath).split('.') if p != '0' and p != ''] + # If frame hints point to top document, stay on page + fid = getattr(params, 'frameIdPath', None) + furl = getattr(params, 'frameUrl', None) + curr_url = (pg.url or '').split('#')[0] if hasattr(pg, 'url') else '' 
+ if furl and curr_url and furl.split('#')[0] == curr_url: + return pg + if fid: + segs = [s for s in str(fid).split('.') if s != ''] + if all(s == '0' for s in segs): + return pg f = pg.main_frame - for seg in parts: - idx = int(seg) - if idx < len(f.child_frames): + for s in segs[1:]: # skip top marker + idx = int(s) + if 0 <= idx < len(f.child_frames): f = f.child_frames[idx] else: - return ctx - ctx = f - elif getattr(params, 'frameUrl', None): + return pg + return f + if furl: from urllib.parse import urlparse - pf = urlparse(params.frameUrl) + pf = urlparse(furl) + # If frameUrl equals current page URL (origin+path), stay on page + try: + from urllib.parse import urlparse as _u + cu = _u(curr_url) + if (cu.scheme, cu.netloc, cu.path) == (pf.scheme, pf.netloc, pf.path): + return pg + except Exception: + pass for fr in pg.frames: try: ff = urlparse(fr.url) - # Match origin, and allow target frameUrl to be a prefix of full URL - if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(params.frameUrl): - ctx = fr - break + if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(furl): + return fr except Exception: continue except Exception: - ctx = pg + return pg return ctx + # Fallback: search all frames for selector (prefer frames matching target origin) + async def _find_in_frames(pg, selector: str): + from urllib.parse import urlparse + prefer = getattr(params, 'frameUrl', None) or getattr(params, 'url', None) or '' + pref_o = urlparse(prefer) if prefer else None + frames = list(pg.frames) + def score(fr): + if not pref_o: + return 0 + try: + fo = urlparse(fr.url) + return 2 if (fo.scheme, fo.netloc) == (pref_o.scheme, pref_o.netloc) else 0 + except Exception: + return 0 + frames.sort(key=score, reverse=True) + for fr in frames: + try: + loc, used = await get_best_element_handle(fr, selector, params, timeout_ms=max(800, DEFAULT_ACTION_TIMEOUT_MS // 2)) + return fr, loc, used + except Exception: + continue + return None, 
None, None + try: - # If the step declares a URL and it's different from current, navigate first (minimum action to reach expected DOM) + # Only auto-navigate for top-document clicks (no frame hints) when a different URL is declared curr = (page.url or '').split('#')[0] - tgt = (getattr(params, 'url', None) or getattr(params, 'frameUrl', None) or '').split('#')[0] - if tgt and tgt.startswith('http') and curr != tgt: - await page.goto(tgt) + declared_url = (getattr(params, 'url', None) or '').split('#')[0] + has_frame_hints = bool(getattr(params, 'frameIdPath', None) or getattr(params, 'frameUrl', None)) + if declared_url and declared_url.startswith('http') and not has_frame_hints and curr != declared_url: + await page.goto(declared_url) await page.wait_for_load_state() ctx = _select_context(page) - locator, selector_used = await get_best_element_handle( - ctx, - params.cssSelector, - params, - timeout_ms=DEFAULT_ACTION_TIMEOUT_MS, - ) + try: + locator, selector_used = await get_best_element_handle( + ctx, + params.cssSelector, + params, + timeout_ms=DEFAULT_ACTION_TIMEOUT_MS, + ) + except Exception: + # Fallback: search all frames + fr, locator, selector_used = await _find_in_frames(page, params.cssSelector) + if locator is None: + raise + await locator.click(force=True) - msg = f'🖱️ Clicked element with CSS selector: {truncate_selector(selector_used)} (original: {truncate_selector(original_selector)})' + used_str = selector_used if isinstance(selector_used, str) and selector_used else params.cssSelector + msg = f'🖱️ Clicked element with CSS selector: {truncate_selector(used_str)} (original: {truncate_selector(original_selector)})' logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) except Exception as e: From 3ec843fccee7212829a20369a18612b803ab6610 Mon Sep 17 00:00:00 2001 From: Diveyam-Mishra Date: Thu, 14 Aug 2025 00:45:17 +0530 Subject: [PATCH 07/13] Testing Script Added --- .github/copilot-instructions.md | 62 ++++++++++++++++ 
workflows/examples/test_iframes.json | 101 +++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) create mode 100644 .github/copilot-instructions.md create mode 100644 workflows/examples/test_iframes.json diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 00000000..9cd442d9 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,62 @@ +# Copilot Instructions for workflow-use + +These project-specific instructions help AI coding agents work productively in this repo. Keep responses concise, implement with tools when possible, and follow the repo’s patterns. + +## Architecture (big picture) +- Monorepo with three main parts: + - `extension/` (WXT MV3 Chrome extension): records user actions via rrweb + custom DOM events and streams to a local server. + - `workflows/` (Python package + CLI + FastAPI backend): converts recorded events into deterministic workflow steps, runs/replays steps (fallback to Browser Use/Playwright). + - `ui/` (Vite/React frontend): GUI to view and run workflows. +- Data flow: + 1) Content scripts capture events → background aggregates → posts `WORKFLOW_UPDATE` to `http://127.0.0.1:7331/event`. + 2) Backend stores/processes to `workflows/examples/*.json` and executes with Playwright. + 3) UI/CLI visualize/run. +- Key types: `extension/src/lib/types.ts` (Stored* events), `extension/src/lib/workflow-types.ts` (Step union), `workflows` Python `Workflow` model. + +## Dev workflows +- Build extension: `cd extension && npm install && npm run build`. +- Python backend setup: `cd workflows && uv sync && playwright install chromium && cp .env.example .env`. +- Record: `cd workflows && python cli.py create-workflow` (starts local server; open Chrome with built extension). +- Run workflow as tool: `python cli.py run-as-tool examples/example.workflow.json --prompt "..."`. +- Run workflow: `python cli.py run-workflow examples/example.workflow.json`. 
+- Launch GUI: `python cli.py launch-gui` (starts FastAPI + UI dev server). + +## Extension patterns +- Use `defineBackground` and `defineContentScript` (WXT). Content script always attaches listeners; background aggregates and emits `WORKFLOW_UPDATE` with a hash to avoid spam. +- Recording: + - rrweb for scroll/meta; custom `CUSTOM_CLICK_EVENT`, `CUSTOM_INPUT_EVENT`, `CUSTOM_KEY_EVENT`, etc. + - New-tab intent: content sends `PREPARE_NEW_TAB`; background correlates `tabs.onCreated` and marks `userInitiated`. + - Activated tab gating: ignore tabs never activated (reduces ad/tracker noise). + - Dedupe: merge consecutive identical steps, collapse rapid empty input bursts, consolidate navigations per tab. + - Iframes: content runs with `allFrames: true` and `matchAboutBlank: true`; events carry `frameUrl` and `frameIdPath`. Background only allows rrweb meta navigations from frames the user interacted with and filters ad/analytics hosts. + +## Backend patterns +- Python FastAPI endpoint `http://127.0.0.1:7331/event` receives: + - `RECORDING_STARTED/STOPPED`, `WORKFLOW_UPDATE` with `steps` only (hash-based dedupe). +- CLI: `workflows/cli.py` provides record/run/launch commands; Playwright is used for replay. +- Keep workflow JSON in `workflows/examples/`. Naming is free-form; version stays at `1.0.0` today. + +## Conventions +- Step schema (extension `workflow-types.ts`): navigation, click, input, key_press, scroll. Prefer merging updates over emitting new steps. +- Use XPath + enhanced CSS selectors; keep values masked for password inputs. +- Avoid sending events from tabs not in `activatedTabs` unless `userInitiated`. +- When adding new event types, extend Stored* in `types.ts`, enrich in content, and map to `Step` in background. + +## Gotchas / Tips +- Avoid noisy iframe navs (recaptcha/ads): rely on `interactedFrameUrls` filtering in background. If adjusting, prefer allow/deny logic over hard-coding hosts in multiple places. 
+- When changing extension logic, rebuild with `npm run build`; dev opens side panel on install/update. +- Screenshot capture only works for visible tabs; background uses `captureVisibleTab` best-effort. +- If tests are added, ensure they run per package (`extension`, `workflows`, `ui`) rather than at repo root. + +## Example tasks for agents +- Add a new step type (e.g., select): + 1) Extend `StoredCustomSelectEvent` in `types.ts` and emit in `content.ts`. + 2) Map to a `SelectStep` in `background.ts` (convertStoredEventsToSteps). + 3) Update backend replay to handle the new step. +- Reduce noise further: + - Tune debounce windows in content. + - Post-process duplicates in `broadcastWorkflowDataUpdate`. + - Add frame interaction checks before accepting rrweb meta navigations. + +## Security & secrets +- Do not commit real API keys. `.env.example` exists; load secrets locally. If you see a real key in `workflows/.env`, instruct maintainers to rotate and remove it. diff --git a/workflows/examples/test_iframes.json b/workflows/examples/test_iframes.json new file mode 100644 index 00000000..b1834355 --- /dev/null +++ b/workflows/examples/test_iframes.json @@ -0,0 +1,101 @@ +{ + "name": "Recorded Workflow", + "description": "Recorded on 8/11/2025, 4:20:47 AM", + "version": "1.0.0", + "input_schema": [], + "steps": [ + { + "type": "navigation", + "timestamp": 1754866228439, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "scroll", + "timestamp": 1754866228608, + "tabId": 388342781, + "targetId": 219, + "scrollX": 0, + "scrollY": 7, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "navigation", + "timestamp": 1754866228634, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866228849, + "tabId": 388342781, + "url": 
"https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameIdPath": "0", + "xpath": "id(\"textareawrapper\")/div[1]/div[6]", + "cssSelector": "div.CodeMirror-scroll", + "elementTag": "DIV", + "elementText": "​

The iframe element

​​" + }, + { + "type": "navigation", + "timestamp": 1754866230495, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866231531, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameIdPath": "0", + "xpath": "body/div[2]/div[1]/a[4]", + "cssSelector": "a.w3-button.w3-bar-item.w3-hide-small.topnav-icons.fa.fa-rotate.ga-tryit[href=\"javascript:void(0);\"][title*=\"Change Orientation\"]", + "elementTag": "A", + "elementText": "" + }, + { + "type": "navigation", + "timestamp": 1754866237707, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866238574, + "tabId": 388342781, + "url": "https://www.w3schools.com/", + "frameUrl": "https://www.w3schools.com/", + "frameIdPath": "0.0", + "xpath": "id(\"subtopnav\")/a[3]", + "cssSelector": "a.ga-nav[href=\"/js/default.asp\"][title*=\"JavaScript Tutorial\"]", + "elementTag": "A", + "elementText": "JAVASCRIPT" + }, + { + "type": "navigation", + "timestamp": 1754866242778, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866242882, + "tabId": 388342781, + "url": "https://www.w3schools.com/js/default.asp", + "frameUrl": "https://www.w3schools.com/js/default.asp", + "frameIdPath": "0.0", + "xpath": "id(\"subtopnav\")/a[1]", + "cssSelector": "a.ga-nav.subtopnav_firstitem[href=\"/html/default.asp\"][title*=\"HTML Tutorial\"]", + "elementTag": "A", + "elementText": "HTML" + }, + { + "type": "navigation", + "timestamp": 1754866246395, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + } + ] +} \ No newline at end of file From 0e46a9f234bdaea02edd2ada1584f7a4f2b412be Mon Sep 17 
00:00:00 2001 From: Diveyam Mishra Date: Sat, 25 Oct 2025 23:07:28 +0530 Subject: [PATCH 08/13] feat(workflow): strengthen iframe-aware execution and extraction --- workflows/examples/test_iframes.json | 101 ++++ workflows/workflow_use/controller/service.py | 131 ++++- workflows/workflow_use/controller/views.py | 7 +- workflows/workflow_use/recorder/service.py | 37 +- workflows/workflow_use/workflow/service.py | 538 ++++++++----------- 5 files changed, 457 insertions(+), 357 deletions(-) create mode 100644 workflows/examples/test_iframes.json diff --git a/workflows/examples/test_iframes.json b/workflows/examples/test_iframes.json new file mode 100644 index 00000000..b1834355 --- /dev/null +++ b/workflows/examples/test_iframes.json @@ -0,0 +1,101 @@ +{ + "name": "Recorded Workflow", + "description": "Recorded on 8/11/2025, 4:20:47 AM", + "version": "1.0.0", + "input_schema": [], + "steps": [ + { + "type": "navigation", + "timestamp": 1754866228439, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "scroll", + "timestamp": 1754866228608, + "tabId": 388342781, + "targetId": 219, + "scrollX": 0, + "scrollY": 7, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "navigation", + "timestamp": 1754866228634, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866228849, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameIdPath": "0", + "xpath": "id(\"textareawrapper\")/div[1]/div[6]", + "cssSelector": "div.CodeMirror-scroll", + "elementTag": "DIV", + "elementText": "​

The iframe element

​​" + }, + { + "type": "navigation", + "timestamp": 1754866230495, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866231531, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameUrl": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe", + "frameIdPath": "0", + "xpath": "body/div[2]/div[1]/a[4]", + "cssSelector": "a.w3-button.w3-bar-item.w3-hide-small.topnav-icons.fa.fa-rotate.ga-tryit[href=\"javascript:void(0);\"][title*=\"Change Orientation\"]", + "elementTag": "A", + "elementText": "" + }, + { + "type": "navigation", + "timestamp": 1754866237707, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866238574, + "tabId": 388342781, + "url": "https://www.w3schools.com/", + "frameUrl": "https://www.w3schools.com/", + "frameIdPath": "0.0", + "xpath": "id(\"subtopnav\")/a[3]", + "cssSelector": "a.ga-nav[href=\"/js/default.asp\"][title*=\"JavaScript Tutorial\"]", + "elementTag": "A", + "elementText": "JAVASCRIPT" + }, + { + "type": "navigation", + "timestamp": 1754866242778, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + }, + { + "type": "click", + "timestamp": 1754866242882, + "tabId": 388342781, + "url": "https://www.w3schools.com/js/default.asp", + "frameUrl": "https://www.w3schools.com/js/default.asp", + "frameIdPath": "0.0", + "xpath": "id(\"subtopnav\")/a[1]", + "cssSelector": "a.ga-nav.subtopnav_firstitem[href=\"/html/default.asp\"][title*=\"HTML Tutorial\"]", + "elementTag": "A", + "elementText": "HTML" + }, + { + "type": "navigation", + "timestamp": 1754866246395, + "tabId": 388342781, + "url": "https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe" + } + ] +} \ No newline at end of file diff --git 
a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py index 4205a952..2465d8cc 100644 --- a/workflows/workflow_use/controller/service.py +++ b/workflows/workflow_use/controller/service.py @@ -3,8 +3,9 @@ from browser_use import Browser from browser_use.agent.views import ActionResult -from browser_use.controller import Controller -from browser_use.llm.base import BaseChatModel +from browser_use.controller.service import Controller +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.prompts import PromptTemplate from workflow_use.controller.utils import get_best_element_handle, truncate_selector from workflow_use.controller.views import ( @@ -19,7 +20,7 @@ logger = logging.getLogger(__name__) -DEFAULT_ACTION_TIMEOUT_MS = 1000 +DEFAULT_ACTION_TIMEOUT_MS = 2500 # List of default actions from browser_use.controller.service.Controller to disable # todo: come up with a better way to filter out the actions (filter IN the actions would be much nicer in this case) @@ -65,9 +66,7 @@ async def navigation(params: NavigationAction, browser_session: Browser) -> Acti """Navigate to the given URL.""" page = await browser_session.get_current_page() await page.goto(params.url) - # Wait for page to load (CDP navigate doesn't wait automatically) - import asyncio - await asyncio.sleep(2) + await page.wait_for_load_state() msg = f'🔗 Navigated to URL: {params.url}' logger.info(msg) @@ -84,16 +83,101 @@ async def click(params: ClickElementDeterministicAction, browser_session: Browse page = await browser_session.get_current_page() original_selector = params.cssSelector + # If frameUrl or frameIdPath are provided, narrow the search to that frame + def _select_context(pg): + try: + from playwright.async_api import Page, Frame + ctx: Page | Frame = pg + # If frame hints point to top document, stay on page + fid = getattr(params, 'frameIdPath', None) + furl = getattr(params, 'frameUrl', None) + curr_url = (pg.url or 
'').split('#')[0] if hasattr(pg, 'url') else '' + if furl and curr_url and furl.split('#')[0] == curr_url: + return pg + if fid: + segs = [s for s in str(fid).split('.') if s != ''] + if all(s == '0' for s in segs): + return pg + f = pg.main_frame + for s in segs[1:]: # skip top marker + idx = int(s) + if 0 <= idx < len(f.child_frames): + f = f.child_frames[idx] + else: + return pg + return f + if furl: + from urllib.parse import urlparse + pf = urlparse(furl) + # If frameUrl equals current page URL (origin+path), stay on page + try: + from urllib.parse import urlparse as _u + cu = _u(curr_url) + if (cu.scheme, cu.netloc, cu.path) == (pf.scheme, pf.netloc, pf.path): + return pg + except Exception: + pass + for fr in pg.frames: + try: + ff = urlparse(fr.url) + if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(furl): + return fr + except Exception: + continue + except Exception: + return pg + return ctx + + # Fallback: search all frames for selector (prefer frames matching target origin) + async def _find_in_frames(pg, selector: str): + from urllib.parse import urlparse + prefer = getattr(params, 'frameUrl', None) or getattr(params, 'url', None) or '' + pref_o = urlparse(prefer) if prefer else None + frames = list(pg.frames) + def score(fr): + if not pref_o: + return 0 + try: + fo = urlparse(fr.url) + return 2 if (fo.scheme, fo.netloc) == (pref_o.scheme, pref_o.netloc) else 0 + except Exception: + return 0 + frames.sort(key=score, reverse=True) + for fr in frames: + try: + loc, used = await get_best_element_handle(fr, selector, params, timeout_ms=max(800, DEFAULT_ACTION_TIMEOUT_MS // 2)) + return fr, loc, used + except Exception: + continue + return None, None, None + try: - locator, selector_used = await get_best_element_handle( - page, - params.cssSelector, - params, - timeout_ms=DEFAULT_ACTION_TIMEOUT_MS, - ) + # Only auto-navigate for top-document clicks (no frame hints) when a different URL is declared + curr = (page.url or 
'').split('#')[0] + declared_url = (getattr(params, 'url', None) or '').split('#')[0] + has_frame_hints = bool(getattr(params, 'frameIdPath', None) or getattr(params, 'frameUrl', None)) + if declared_url and declared_url.startswith('http') and not has_frame_hints and curr != declared_url: + await page.goto(declared_url) + await page.wait_for_load_state() + + ctx = _select_context(page) + try: + locator, selector_used = await get_best_element_handle( + ctx, + params.cssSelector, + params, + timeout_ms=DEFAULT_ACTION_TIMEOUT_MS, + ) + except Exception: + # Fallback: search all frames + fr, locator, selector_used = await _find_in_frames(page, params.cssSelector) + if locator is None: + raise + await locator.click(force=True) - msg = f'🖱️ Clicked element with CSS selector: {truncate_selector(selector_used)} (original: {truncate_selector(original_selector)})' + used_str = selector_used if isinstance(selector_used, str) and selector_used else params.cssSelector + msg = f'🖱️ Clicked element with CSS selector: {truncate_selector(used_str)} (original: {truncate_selector(original_selector)})' logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) except Exception as e: @@ -201,7 +285,7 @@ async def key_press(params: KeyPressDeterministicAction, browser_session: Browse async def scroll(params: ScrollDeterministicAction, browser_session: Browser) -> ActionResult: """Scroll the page by the given x/y pixel offsets.""" page = await browser_session.get_current_page() - await page.evaluate(f'() => window.scrollBy({params.scrollX}, {params.scrollY})') + await page.evaluate(f'window.scrollBy({params.scrollX}, {params.scrollY});') msg = f'📜 Scrolled page by (x={params.scrollX}, y={params.scrollY})' logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) @@ -220,18 +304,19 @@ async def extract_page_content( strip = ['a', 'img'] - # Get page HTML content using CDP evaluate - html_content = await page.evaluate('() => 
document.documentElement.outerHTML') - content = markdownify.markdownify(html_content, strip=strip) + content = markdownify.markdownify(await page.content(), strip=strip) - # Note: iframe content extraction is not yet supported in CDP-based implementation - # TODO: Implement iframe content extraction using CDP + # manually append iframe text into the content so it's readable by the LLM (includes cross-origin iframes) + for iframe in page.frames: + if iframe.url != page.url and not iframe.url.startswith('data:'): + content += f'\n\nIFRAME {iframe.url}:\n' + content += markdownify.markdownify(await iframe.content()) - prompt = f'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {params.goal}, Page: {content}' + prompt = 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. 
Extraction goal: {goal}, Page: {page}' + template = PromptTemplate(input_variables=['goal', 'page'], template=prompt) try: - from browser_use.llm import UserMessage - output = await page_extraction_llm.ainvoke([UserMessage(content=prompt)]) - msg = f'📄 Extracted from page\n: {output.completion}\n' + output = await page_extraction_llm.ainvoke(template.format(goal=params.goal, page=content)) + msg = f'📄 Extracted from page\n: {output.content}\n' logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) except Exception as e: diff --git a/workflows/workflow_use/controller/views.py b/workflows/workflow_use/controller/views.py index 1df0900c..ffb25b4a 100644 --- a/workflows/workflow_use/controller/views.py +++ b/workflows/workflow_use/controller/views.py @@ -13,9 +13,8 @@ class Config: # Mixin for shared step metadata (timestamp and tab context) class StepMeta(_BaseExtra): - # timestamp: int - # tabId: int - pass + timestamp: int + tabId: int # Common optional fields present in recorder events @@ -24,6 +23,8 @@ class RecorderBase(StepMeta): elementTag: Optional[str] = None elementText: Optional[str] = None frameUrl: Optional[str] = None + frameIdPath: Optional[str] = None + url: Optional[str] = None screenshot: Optional[str] = None diff --git a/workflows/workflow_use/recorder/service.py b/workflows/workflow_use/recorder/service.py index 756ca6bb..5de21556 100644 --- a/workflows/workflow_use/recorder/service.py +++ b/workflows/workflow_use/recorder/service.py @@ -7,6 +7,7 @@ from browser_use import Browser from browser_use.browser.profile import BrowserProfile from fastapi import FastAPI +from patchright.async_api import async_playwright as patchright_async_playwright # Assuming views.py is correctly located for this import path from workflow_use.recorder.views import ( @@ -84,7 +85,32 @@ async def _capture_and_signal_final_workflow(self, trigger_reason: str): async with self.final_workflow_processed_lock: if not 
self.final_workflow_processed_flag and self.last_workflow_update_event: print(f'[Service] Capturing final workflow (Trigger: {trigger_reason}).') - self.final_workflow_output = self.last_workflow_update_event.payload + wf = self.last_workflow_update_event.payload + # Backend safety filter: drop about:blank and obvious ad/analytics iframe navigations + try: + clean_steps = [] + for s in wf.steps: + st = getattr(s, 'type', None) or (s.get('type') if isinstance(s, dict) else None) + url = getattr(s, 'url', None) or (s.get('url') if isinstance(s, dict) else None) + if st == 'navigation': + if not url or url == 'about:blank': + continue + from urllib.parse import urlparse + host = urlparse(url).hostname or '' + blocked = any( + pat in host for pat in ( + 'doubleclick.net', 'googlesyndication.com', 'googleadservices.com', + 'amazon-adsystem.com', '2mdn.net', 'recaptcha.google.com', 'recaptcha.net', + 'googletagmanager.com', 'indexww.com', 'adtrafficquality.google' + ) + ) + if blocked: + continue + clean_steps.append(s) + wf.steps = clean_steps + except Exception as e: + print(f'[Service] Backend filter failed: {e}') + self.final_workflow_output = wf self.final_workflow_processed_flag = True processed_this_call = True @@ -96,7 +122,7 @@ async def _capture_and_signal_final_workflow(self, trigger_reason: str): if trigger_reason == 'RecordingStoppedEvent' and self.browser: print('[Service] Attempting to close browser due to RecordingStoppedEvent...') try: - await self.browser.stop() + await self.browser.close() print('[Service] Browser close command issued.') except Exception as e_close: print(f'[Service] Error closing browser on recording stop: {e_close}') @@ -127,7 +153,8 @@ async def _launch_browser_and_wait(self): ) # Create and configure browser - self.browser = Browser(browser_profile=profile) + playwright = await patchright_async_playwright().start() + self.browser = Browser(browser_profile=profile, playwright=playwright) print('[Service] Starting browser with 
extensions...') await self.browser.start() @@ -150,7 +177,7 @@ async def _launch_browser_and_wait(self): print('[Service] Browser task cancelled.') if self.browser: try: - await self.browser.stop() + await self.browser.close() except: pass # Best effort raise # Re-raise to be caught by gather @@ -218,7 +245,7 @@ async def capture_workflow(self) -> Optional[WorkflowDefinitionSchema]: print('[Service] Ensuring browser is closed in cleanup...') try: self.browser.browser_profile.keep_alive = False - await self.browser.stop() + await self.browser.close() except Exception as e_browser_close: print(f'[Service] Error closing browser in final cleanup: {e_browser_close}') # self.browser = None diff --git a/workflows/workflow_use/workflow/service.py b/workflows/workflow_use/workflow/service.py index e8bedad7..d21da2e8 100644 --- a/workflows/workflow_use/workflow/service.py +++ b/workflows/workflow_use/workflow/service.py @@ -6,25 +6,32 @@ import logging from pathlib import Path from typing import Any, Dict, List, TypeVar -from typing import cast as _cast from browser_use import Agent, Browser from browser_use.agent.views import ActionResult, AgentHistoryList -from browser_use.llm.base import BaseChatModel -from browser_use.llm import SystemMessage, UserMessage -from pydantic import BaseModel, Field, create_model +from langchain.agents import AgentExecutor, create_tool_calling_agent +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import AIMessage, BaseMessage, HumanMessage +from langchain_core.prompts import ChatPromptTemplate +from langchain_core.tools import StructuredTool +from pydantic import BaseModel, create_model from workflow_use.controller.service import WorkflowController from workflow_use.controller.utils import get_best_element_handle from workflow_use.schema.views import ( AgenticWorkflowStep, + ClickStep, DeterministicWorkflowStep, + InputStep, + KeyPressStep, + NavigationStep, + ScrollStep, + SelectChangeStep, 
WorkflowDefinitionSchema, WorkflowInputSchemaDefinition, WorkflowStep, ) -from workflow_use.workflow.prompts import AGENT_STEP_SYSTEM_PROMPT, STRUCTURED_OUTPUT_PROMPT -from workflow_use.workflow.step_agent.controller import WorkflowStepAgentController +from workflow_use.workflow.prompts import STRUCTURED_OUTPUT_PROMPT, WORKFLOW_FALLBACK_PROMPT_TEMPLATE from workflow_use.workflow.views import WorkflowRunOutput logger = logging.getLogger(__name__) @@ -40,13 +47,12 @@ class Workflow: def __init__( self, workflow_schema: WorkflowDefinitionSchema, - llm: BaseChatModel, *, controller: WorkflowController | None = None, browser: Browser | None = None, + llm: BaseChatModel | None = None, page_extraction_llm: BaseChatModel | None = None, fallback_to_agent: bool = True, - use_cloud: bool = False, ) -> None: """Initialize a new Workflow instance from a schema object. @@ -56,16 +62,20 @@ def __init__( browser: Optional Browser instance to use for browser automation llm: Optional language model for fallback agent functionality fallback_to_agent: Whether to fall back to agent-based execution on step failure - use_cloud: Whether to use browser-use cloud browser service instead of local browser Raises: ValueError: If the workflow schema is invalid (though Pydantic handles most). 
""" self.schema = workflow_schema # Store the schema object + self.name = self.schema.name + self.description = self.schema.description + self.version = self.schema.version + self.steps = self.schema.steps + self.controller = controller or WorkflowController() - self.browser = browser or Browser(use_cloud=use_cloud) + self.browser = browser or Browser() # Hack to not close it after agent kicks in self.browser.browser_profile.keep_alive = True @@ -85,12 +95,11 @@ def __init__( def load_from_file( cls, file_path: str | Path, - llm: BaseChatModel, *, controller: WorkflowController | None = None, browser: Browser | None = None, + llm: BaseChatModel | None = None, page_extraction_llm: BaseChatModel | None = None, - use_cloud: bool = False, ) -> Workflow: """Load a workflow from a file.""" with open(file_path, 'r', encoding='utf-8') as f: @@ -102,7 +111,6 @@ def load_from_file( browser=browser, llm=llm, page_extraction_llm=page_extraction_llm, - use_cloud=use_cloud, ) # --- Runners --- @@ -127,167 +135,125 @@ def truncate_selector(selector: str) -> str: # Determine if this is not the last step, and extract next step's cssSelector if available current_index = step_index - if current_index < len(self.schema.steps) - 1: - next_step = self.schema.steps[current_index + 1] + if current_index < len(self.steps) - 1: + next_step = self.steps[current_index + 1] next_step_resolved = self._resolve_placeholders(next_step) css_selector = getattr(next_step_resolved, 'cssSelector', None) if css_selector: try: + await self.browser._wait_for_stable_network() page = await self.browser.get_current_page() - logger.info(f'Waiting for element with selector: {truncate_selector(css_selector)}') - locator, selector_used = await get_best_element_handle( - page, css_selector, next_step_resolved, timeout_ms=WAIT_FOR_ELEMENT_TIMEOUT - ) - logger.info(f'Element with selector found: {truncate_selector(selector_used)}') + # If the next step declares a URL/frameUrl and it does not match the current page 
URL, + # skip waiting for its element on the current page (prevents false failures like step 7). + curr_url = (page.url or '').split('#')[0] + declared_next_url = (getattr(next_step_resolved, 'url', None) or getattr(next_step_resolved, 'frameUrl', None) or '').split('#')[0] + if declared_next_url and declared_next_url != curr_url: + logger.info( + f"Skipping pre-wait for next selector because declared next URL ({truncate_selector(declared_next_url)}) != current URL ({truncate_selector(curr_url)})" + ) + else: + logger.info(f'Waiting for element with selector: {truncate_selector(css_selector)}') + locator, selector_used = await get_best_element_handle( + page, css_selector, next_step_resolved, timeout_ms=WAIT_FOR_ELEMENT_TIMEOUT + ) + logger.info(f'Element with selector found: {truncate_selector(selector_used)}') except Exception as e: logger.error(f'Failed to wait for element with selector: {truncate_selector(css_selector)}. Error: {e}') raise Exception(f'Failed to wait for element. Selector: {css_selector}') from e return result - def _format_agent_step_context(self, current_step: AgenticWorkflowStep, step_index: int) -> str: - """Format the workflow step context for the agent with extended context (last 2, current, next 2 steps).""" - - def format_step_info(step: WorkflowStep, step_num: int) -> str: - """Format step information consistently.""" - info = [f'Step {step_num}: Type: {step.type}'] - if step.description: - info.append(f'Description: {step.description}') - # For agent steps, show the task - if isinstance(step, AgenticWorkflowStep): - info.append(f'Task: {step.task}') - return '\n'.join(info) - - sections = [] - total_steps = len(self.schema.steps) - - # Add previous steps context (last 2 steps) - prev_steps = [] - for i in range(max(0, step_index - 2), step_index): - prev_step = self.schema.steps[i] - prev_steps.append(format_step_info(prev_step, i + 1)) - - if prev_steps: - sections.extend(['=== PREVIOUS STEPS (FOR CONTEXT ONLY) ===', 
'\n\n'.join(prev_steps), '']) - - # Add current step context - sections.extend(['=== CURRENT STEP (YOUR TASK) ===', format_step_info(current_step, step_index + 1), '']) - - # Add next steps context (next 2 steps) - next_steps = [] - for i in range(step_index + 1, min(total_steps, step_index + 3)): - next_step = self.schema.steps[i] - next_steps.append(format_step_info(next_step, i + 1)) - - if next_steps: - sections.extend( - [ - '=== NEXT STEPS (FOR CONTEXT ONLY) ===', - '\n\n'.join(next_steps), - ] - ) - - return '\n'.join(sections) - - async def _run_agent_step(self, step: AgenticWorkflowStep, step_index: int) -> AgentHistoryList: + async def _run_agent_step(self, step: AgenticWorkflowStep) -> AgentHistoryList: """Spin-up an Agent based on step dictionary.""" - # Create contextual task with extended context (last 2, current, next 2 steps) - contextual_task = self._format_agent_step_context(step, step_index) - - # logger.info(f'Contextual task: {contextual_task}') - - # task = """ - # {step.task} + if self.llm is None: + raise ValueError("An 'llm' instance must be supplied for agent-based steps") - # Please do not make up any fake data. 
- # """ + task: str = step.task + max_steps: int = step.max_steps or 5 agent = Agent( - task=step.task, # Only the current step task goes into ultimate task - message_context=contextual_task, # Extended context with surrounding steps + task=task, llm=self.llm, browser_session=self.browser, - controller=WorkflowStepAgentController(), - # use_vision=True, # Consider making this configurable via WorkflowStep schema - override_system_message=AGENT_STEP_SYSTEM_PROMPT, + use_vision=True, # Consider making this configurable via WorkflowStep schema + ) + return await agent.run(max_steps=max_steps) + + async def _fallback_to_agent( + self, + step_resolved: WorkflowStep, + step_index: int, + error: Exception | str | None = None, + ) -> AgentHistoryList: + """Handle step failure by delegating to an agent.""" + if self.llm is None: + raise ValueError("Cannot fall back to agent: An 'llm' instance must be supplied") + # print('Workflow steps:', step_resolved) + # Extract details from the failed step dictionary + failed_action_name = step_resolved.type + failed_params = step_resolved.model_dump() + step_description = step_resolved.description or 'No description provided' + error_msg = str(error) if error else 'Unknown error' + total_steps = len(self.steps) + fail_details = ( + f"step={step_index + 1}/{total_steps}, action='{failed_action_name}', " + f"description='{step_description}', params={str(failed_params)}, error='{error_msg}'" ) - return await agent.run() - - # async def _fallback_to_agent( - # self, - # step_resolved: WorkflowStep, - # step_index: int, - # error: Exception | str | None = None, - # ) -> AgentHistoryList: - # """Handle step failure by delegating to an agent.""" - - # # print('Workflow steps:', step_resolved) - # # Extract details from the failed step dictionary - # failed_action_name = step_resolved.type - # failed_params = step_resolved.model_dump() - # step_description = step_resolved.description or 'No description provided' - # error_msg = str(error) if 
error else 'Unknown error' - # total_steps = len(self.steps) - # fail_details = ( - # f"step={step_index + 1}/{total_steps}, action='{failed_action_name}', " - # f"description='{step_description}', params={str(failed_params)}, error='{error_msg}'" - # ) - - # # Determine the failed_value based on step type and attributes - # failed_value = None - # description_prefix = f'Purpose: {step_description}. ' if step_description else '' - - # if isinstance(step_resolved, NavigationStep): - # failed_value = f'{description_prefix}Navigate to URL: {step_resolved.url}' - # elif isinstance(step_resolved, ClickStep): - # # element_info = step_resolved.elementText or step_resolved.cssSelector - # # failed_value = f"{description_prefix}Click element: {element_info}" - # failed_value = f'Find and click element with description: {step_resolved.description}' - # elif isinstance(step_resolved, InputStep): - # failed_value = f"{description_prefix}Input text: '{step_resolved.value}' into element." - # elif isinstance(step_resolved, SelectChangeStep): - # failed_value = f"{description_prefix}Select option: '{step_resolved.selectedText}' in dropdown." - # elif isinstance(step_resolved, KeyPressStep): - # failed_value = f"{description_prefix}Press key: '{step_resolved.key}'" - # elif isinstance(step_resolved, ScrollStep): - # failed_value = f'{description_prefix}Scroll to position: (x={step_resolved.scrollX}, y={step_resolved.scrollY})' - # else: - # failed_value = f"{description_prefix}No specific target value available for action '{failed_action_name}'" - - # # Build workflow overview using the stored dictionaries - # workflow_overview_lines: list[str] = [] - # for idx, step in enumerate(self.steps): - # desc = step.description or '' - # step_type_info = step.type - # details = step.model_dump() - # workflow_overview_lines.append(f' {idx + 1}. 
({step_type_info}) {desc} - {details}') - # workflow_overview = '\n'.join(workflow_overview_lines) - # # print(workflow_overview) - - # # Build the fallback task with the failed_value - # fallback_task = WORKFLOW_FALLBACK_PROMPT_TEMPLATE.format( - # step_index=step_index + 1, - # total_steps=len(self.steps), - # workflow_details=workflow_overview, - # action_type=failed_action_name, - # fail_details=fail_details, - # failed_value=failed_value, - # step_description=step_description, - # ) - # logger.info(f'Agent fallback task: {fallback_task}') - - # # Prepare agent step config based on the failed step, adding task - # agent_step_config = AgenticWorkflowStep( - # type='agent', - # task=fallback_task, - # max_steps=5, - # output=None, - # description='Fallback agent to handle step failure', - # ) - - # return await self._run_agent_step(agent_step_config) + # Determine the failed_value based on step type and attributes + failed_value = None + description_prefix = f'Purpose: {step_description}. ' if step_description else '' + + if isinstance(step_resolved, NavigationStep): + failed_value = f'{description_prefix}Navigate to URL: {step_resolved.url}' + elif isinstance(step_resolved, ClickStep): + # element_info = step_resolved.elementText or step_resolved.cssSelector + # failed_value = f"{description_prefix}Click element: {element_info}" + failed_value = f'Find and click element with description: {step_resolved.description}' + elif isinstance(step_resolved, InputStep): + failed_value = f"{description_prefix}Input text: '{step_resolved.value}' into element." + elif isinstance(step_resolved, SelectChangeStep): + failed_value = f"{description_prefix}Select option: '{step_resolved.selectedText}' in dropdown." 
+ elif isinstance(step_resolved, KeyPressStep): + failed_value = f"{description_prefix}Press key: '{step_resolved.key}'" + elif isinstance(step_resolved, ScrollStep): + failed_value = f'{description_prefix}Scroll to position: (x={step_resolved.scrollX}, y={step_resolved.scrollY})' + else: + failed_value = f"{description_prefix}No specific target value available for action '{failed_action_name}'" + + # Build workflow overview using the stored dictionaries + workflow_overview_lines: list[str] = [] + for idx, step in enumerate(self.steps): + desc = step.description or '' + step_type_info = step.type + details = step.model_dump() + workflow_overview_lines.append(f' {idx + 1}. ({step_type_info}) {desc} - {details}') + workflow_overview = '\n'.join(workflow_overview_lines) + # print(workflow_overview) + + # Build the fallback task with the failed_value + fallback_task = WORKFLOW_FALLBACK_PROMPT_TEMPLATE.format( + step_index=step_index + 1, + total_steps=len(self.steps), + workflow_details=workflow_overview, + action_type=failed_action_name, + fail_details=fail_details, + failed_value=failed_value, + step_description=step_description, + ) + logger.info(f'Agent fallback task: {fallback_task}') + + # Prepare agent step config based on the failed step, adding task + agent_step_config = AgenticWorkflowStep( + type='agent', + task=fallback_task, + max_steps=5, + output=None, + description='Fallback agent to handle step failure', + ) + + return await self._run_agent_step(agent_step_config) def _validate_inputs(self, inputs: dict[str, Any]) -> None: """Validate provided inputs against the workflow's input schema definition.""" @@ -421,35 +387,31 @@ async def _execute_step(self, step_index: int, step_resolved: WorkflowStep) -> A logger.warning( f'Deterministic step {step_index + 1} ({action_name}) failed: {e}. Attempting fallback with agent.' 
) - - raise ValueError(f'Deterministic step {step_index + 1} ({action_name}) failed: {e}') - - # if self.fallback_to_agent: - # result = await self._fallback_to_agent(step_resolved, step_index, e) - # if not result.is_successful(): - # raise ValueError(f'Deterministic step {step_index + 1} ({action_name}) failed even after fallback') - # else: - # raise ValueError(f'Deterministic step {step_index + 1} ({action_name}) failed: {e}') - + if self.llm is None: + raise ValueError('Cannot fall back to agent: LLM instance required.') + if self.fallback_to_agent: + result = await self._fallback_to_agent(step_resolved, step_index, e) + if not result.is_successful(): + raise ValueError(f'Deterministic step {step_index + 1} ({action_name}) failed even after fallback') + else: + raise ValueError(f'Deterministic step {step_index + 1} ({action_name}) failed: {e}') elif isinstance(step_resolved, AgenticWorkflowStep): # Use task key from step dictionary task_description = step_resolved.task logger.info(f'Running agent task: {task_description}') try: - result = await self._run_agent_step(step_resolved, step_index) + result = await self._run_agent_step(step_resolved) if not result.is_successful(): logger.warning(f'Agent step {step_index + 1} failed evaluation.') raise ValueError(f'Agent step {step_index + 1} failed evaluation.') - except Exception as e: - raise ValueError(f'Agent step {step_index + 1} failed: {e}. (Agent fallback is disabled)') - if self.fallback_to_agent: logger.warning(f'Agent step {step_index + 1} failed: {e}. 
Attempting fallback with agent.') - - # result = await self._fallback_to_agent(step_resolved, step_index, e) - # if not result.is_successful(): - # raise ValueError(f'Agent step {step_index + 1} failed even after fallback') + if self.llm is None: + raise ValueError('Cannot fall back to agent: LLM instance required.') + result = await self._fallback_to_agent(step_resolved, step_index, e) + if not result.is_successful(): + raise ValueError(f'Agent step {step_index + 1} failed even after fallback') else: raise ValueError(f'Agent step {step_index + 1} failed: {e}') @@ -463,7 +425,7 @@ async def _convert_results_to_output_model( ) -> T: """Convert workflow results to a specified output model. - Filters ActionResults with extracted_content, then uses LLM to parse + Filters ActionResults with extracted_content, then uses LangChain to parse all extracted texts into the structured output model. Args: @@ -476,6 +438,9 @@ async def _convert_results_to_output_model( if not results: raise ValueError('No results to convert') + if self.llm is None: + raise ValueError('LLM is required for structured output conversion') + # Extract all content from ActionResults extracted_contents = [] @@ -496,13 +461,15 @@ async def _convert_results_to_output_model( # Combine all extracted contents combined_text = '\n\n'.join(extracted_contents) - messages = [ - SystemMessage(content=STRUCTURED_OUTPUT_PROMPT), - UserMessage(content=combined_text), + messages: list[BaseMessage] = [ + AIMessage(content=STRUCTURED_OUTPUT_PROMPT), + HumanMessage(content=combined_text), ] - response = await self.llm.ainvoke(messages, output_format=output_model) - return response.completion + chain = self.llm.with_structured_output(output_model) + chain_result: T = await chain.ainvoke(messages) # type: ignore + + return chain_result async def run_step(self, step_index: int, inputs: dict[str, Any] | None = None): """Run a *single* workflow step asynchronously and return its result. 
@@ -516,8 +483,8 @@ async def run_step(self, step_index: int, inputs: dict[str, Any] | None = None): are validated and injected into :pyattr:`context`. Subsequent calls can omit *inputs* as :pyattr:`context` is already populated. """ - if not (0 <= step_index < len(self.schema.steps)): - raise IndexError(f'step_index {step_index} is out of range for workflow with {len(self.schema.steps)} steps') + if not (0 <= step_index < len(self.steps)): + raise IndexError(f'step_index {step_index} is out of range for workflow with {len(self.steps)} steps') # Initialise/augment context once with the provided inputs if inputs is not None or not self.context: @@ -531,7 +498,7 @@ async def run_step(self, step_index: int, inputs: dict[str, Any] | None = None): self.context.update(runtime_inputs) async with self.browser: - raw_step_cfg = self.schema.steps[step_index] + raw_step_cfg = self.steps[step_index] step_resolved = self._resolve_placeholders(raw_step_cfg) result = await self._execute_step(step_index, step_resolved) # Persist outputs (if declared) for future steps @@ -573,8 +540,9 @@ async def run( await self.browser.start() try: - for step_index, step_dict in enumerate(self.schema.steps): # self.steps now holds dictionaries + for step_index, step_dict in enumerate(self.steps): # self.steps now holds dictionaries await asyncio.sleep(0.1) + await self.browser._wait_for_stable_network() # Check if cancellation was requested if cancel_event and cancel_event.is_set(): @@ -583,7 +551,7 @@ async def run( # Use description from the step dictionary step_description = step_dict.description or 'No description provided' - logger.info(f'--- Running Step {step_index + 1}/{len(self.schema.steps)} -- {step_description} ---') + logger.info(f'--- Running Step {step_index + 1}/{len(self.steps)} -- {step_description} ---') # Resolve placeholders using the current context (works on the dictionary) step_resolved = self._resolve_placeholders(step_dict) @@ -604,67 +572,39 @@ async def run( # Clean-up 
browser after finishing workflow if close_browser_at_end: self.browser.browser_profile.keep_alive = False - await self.browser.stop() + await self.browser.close() return WorkflowRunOutput(step_results=results, output_model=output_model_result) # ------------------------------------------------------------------ - # LLM tool wrapper + # LangChain tool wrapper # ------------------------------------------------------------------ def _build_input_model(self) -> type[BaseModel]: - """Return a *pydantic* model matching the workflow's ``input_schema`` section. - - This creates a dynamic Pydantic model that includes format information in field - descriptions, making format requirements visible to LLMs when workflows are used as tools. - """ - + """Return a *pydantic* model matching the workflow's ``input_schema`` section.""" if not self.inputs_def: # No declared inputs -> generate an empty model # Use schema name for uniqueness, fallback if needed model_name = f'{(self.schema.name or "Workflow").replace(" ", "_")}_NoInputs' return create_model(model_name) - # Map workflow input types to Python types type_mapping = { 'string': str, 'number': float, - 'bool': bool, + 'bool': bool, # Added boolean type } - - # Build fields dictionary for create_model() fields: Dict[str, tuple[type, Any]] = {} - for input_def in self.inputs_def: name = input_def.name type_str = input_def.type py_type = type_mapping.get(type_str) - if py_type is None: raise ValueError(f'Unsupported input type: {type_str!r} for field {name!r}') - - # Create field description with format information if available - # This helps LLMs understand expected input formats when workflow is used as a tool - field_description = None - if hasattr(input_def, 'format') and input_def.format: - field_description = f'Format: {input_def.format}' - - # Build field tuple: (type, default_or_field_info) # Pydantic's create_model uses ... 
(Ellipsis) to mark required fields - if input_def.required: - if field_description: - # Required field with format description - fields[name] = (py_type, Field(..., description=field_description)) - else: - # Required field without format description - fields[name] = (py_type, ...) - else: - if field_description: - # Optional field with format description - fields[name] = (py_type, Field(None, description=field_description)) - else: - # Optional field without format description - fields[name] = (py_type, None) + default = ... if input_def.required else None + fields[name] = (py_type, default) + + from typing import cast as _cast # The raw ``create_model`` helper from Pydantic deliberately uses *dynamic* # signatures, which the static type checker cannot easily verify. We cast @@ -674,119 +614,65 @@ def _build_input_model(self) -> type[BaseModel]: **_cast(Dict[str, Any], fields), ) - async def run_as_tool(self, prompt: str) -> str: - """Run the workflow with inputs parsed from a natural language prompt. - - Args: - prompt: Natural language description of the task and inputs + def as_tool(self, *, name: str | None = None, description: str | None = None): # noqa: D401 + """Expose the entire workflow as a LangChain *StructuredTool* instance. - Returns: - JSON string with workflow results + The generated tool validates its arguments against the workflow's input + schema (if present) and then returns the JSON-serialised output of + :py:meth:`run`. """ - if self.llm is None: - raise ValueError("LLM is required for run_as_tool to parse inputs from prompt") - - # Parse inputs from prompt using LLM - input_model = self._build_input_model() - - system_prompt = f"""You are a helpful assistant that extracts workflow input parameters from user prompts. 
-The workflow requires the following inputs: -{json.dumps(input_model.model_json_schema(), indent=2)} - -Extract the values from the user's prompt and return them in the required format.""" - - messages = [ - SystemMessage(content=system_prompt), - UserMessage(content=prompt) - ] - - response = await self.llm.ainvoke(messages, output_format=input_model) - inputs = response.completion.model_dump() - - # Run the workflow with parsed inputs - result = await self.run(inputs=inputs, close_browser_at_end=True) - - # Return results as JSON - output = { - "success": True, - "steps_executed": len(result.step_results), - "inputs_used": inputs, - "context": self.context - } - - return json.dumps(output, indent=2) - - async def run_with_no_ai( - self, - inputs: dict[str, Any] | None = None, - close_browser_at_end: bool = True, - cancel_event: asyncio.Event | None = None, - output_model: type[T] | None = None, - ) -> WorkflowRunOutput[T]: - """Execute the workflow using semantic abstraction without any AI/LLM involvement. - This method uses semantic mapping to convert visible text to deterministic selectors, - avoiding expensive LLM calls and fragile CSS selectors. + InputModel = self._build_input_model() + # Use schema name as default, sanitize for tool name requirements + default_name = ''.join(c if c.isalnum() else '_' for c in self.name) + tool_name = name or default_name[:50] + doc = description or self.description # Use schema description + + # `self` is closed over via the inner function so we can keep state. + async def _invoke(**kwargs): # type: ignore[override] + logger.info(f'Running workflow as tool with inputs: {kwargs}') + augmented_inputs = kwargs.copy() if kwargs else {} + for input_def in self.inputs_def: + if not input_def.required and input_def.name not in augmented_inputs: + augmented_inputs[input_def.name] = '' + result = await self.run(inputs=augmented_inputs) + # Serialise non-string output so models that expect a string tool + # response still work. 
+ try: + return _json.dumps(result, default=str) + except Exception: + return str(result) - Args: - inputs: Optional dictionary of workflow inputs - close_browser_at_end: Whether to close the browser when done - cancel_event: Optional event to signal cancellation - output_model: Optional Pydantic model class to convert results to + return StructuredTool.from_function( + coroutine=_invoke, + name=tool_name, + description=doc, + args_schema=InputModel, + ) - Returns: - WorkflowRunOutput containing all step results + async def run_as_tool(self, prompt: str) -> str: """ - from workflow_use.workflow.semantic_executor import SemanticWorkflowExecutor - - runtime_inputs = inputs or {} - # 1. Validate inputs against definition - self._validate_inputs(runtime_inputs) - # 2. Initialize context with validated inputs - self.context = runtime_inputs.copy() # Start with a fresh context - - results: List[ActionResult | AgentHistoryList] = [] - - await self.browser.start() - semantic_executor = SemanticWorkflowExecutor(self.browser, page_extraction_llm=self.page_extraction_llm) - - try: - for step_index, step_dict in enumerate(self.schema.steps): - await asyncio.sleep(0.1) + Run the workflow with a prompt and automatically parse the required variables. - # Check if cancellation was requested - if cancel_event and cancel_event.is_set(): - logger.info('Cancellation requested - stopping workflow execution') - break - - # Use description from the step dictionary - step_description = step_dict.description or 'No description provided' - logger.info(f'--- Running Step {step_index + 1}/{len(self.schema.steps)} -- {step_description} ---') - - # Resolve placeholders using the current context (works on the dictionary) - step_resolved = self._resolve_placeholders(step_dict) - - # Only process deterministic steps (no agent steps) - if step_resolved.type == 'agent': - raise Exception(f"Agent steps are not supported in run_with_no_ai mode. 
Step {step_index + 1} is an agent step.") - - # Execute step using semantic executor - result = await semantic_executor.execute_step(step_resolved) - - results.append(result) - # Persist outputs using the resolved step dictionary - self._store_output(step_resolved, result) - logger.info(f'--- Finished Step {step_index + 1} ---\n') - - # Convert results to output model if requested - output_model_result: T | None = None - if output_model: - output_model_result = await self._convert_results_to_output_model(results, output_model) + @dev Uses AgentExecutor to properly handle the tool invocation loop. + """ - finally: - # Clean-up browser after finishing workflow - if close_browser_at_end: - self.browser.browser_profile.keep_alive = False - await self.browser.stop() + # For now I kept it simple but one could think of using a react agent here. + if self.llm is None: + raise ValueError("Cannot run as tool: An 'llm' instance must be supplied for tool-based steps") + + prompt_template = ChatPromptTemplate.from_messages( + [ + ('system', 'You are a helpful assistant'), + ('human', '{input}'), + # Placeholders fill up a **list** of messages + ('placeholder', '{agent_scratchpad}'), + ] + ) - return WorkflowRunOutput(step_results=results, output_model=output_model_result) + # Create the workflow tool + workflow_tool = self.as_tool() + agent = create_tool_calling_agent(self.llm, [workflow_tool], prompt_template) + agent_executor = AgentExecutor(agent=agent, tools=[workflow_tool]) + result = await agent_executor.ainvoke({'input': prompt}) + return result['output'] From 015fb5c3b1cb6bf1b47c2ae063ae68b25a11fc1c Mon Sep 17 00:00:00 2001 From: Diveyam Mishra Date: Sat, 25 Oct 2025 23:37:14 +0530 Subject: [PATCH 09/13] chore: address review feedback --- workflows/workflow_use/controller/service.py | 34 +++++++++----------- workflows/workflow_use/recorder/service.py | 11 +++++-- workflows/workflow_use/workflow/service.py | 4 +-- 3 files changed, 26 insertions(+), 23 deletions(-) diff 
--git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py index 2465d8cc..505ef6b6 100644 --- a/workflows/workflow_use/controller/service.py +++ b/workflows/workflow_use/controller/service.py @@ -6,6 +6,7 @@ from browser_use.controller.service import Controller from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.prompts import PromptTemplate +from urllib.parse import urlparse from workflow_use.controller.utils import get_best_element_handle, truncate_selector from workflow_use.controller.views import ( @@ -107,30 +108,27 @@ def _select_context(pg): return pg return f if furl: - from urllib.parse import urlparse - pf = urlparse(furl) - # If frameUrl equals current page URL (origin+path), stay on page + pf = urlparse(furl) + # If frameUrl equals current page URL (origin+path), stay on page + try: + cu = urlparse(curr_url) + if (cu.scheme, cu.netloc, cu.path) == (pf.scheme, pf.netloc, pf.path): + return pg + except Exception: + pass + for fr in pg.frames: try: - from urllib.parse import urlparse as _u - cu = _u(curr_url) - if (cu.scheme, cu.netloc, cu.path) == (pf.scheme, pf.netloc, pf.path): - return pg + ff = urlparse(fr.url) + if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(furl): + return fr except Exception: - pass - for fr in pg.frames: - try: - ff = urlparse(fr.url) - if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(furl): - return fr - except Exception: - continue + continue except Exception: return pg return ctx # Fallback: search all frames for selector (prefer frames matching target origin) async def _find_in_frames(pg, selector: str): - from urllib.parse import urlparse prefer = getattr(params, 'frameUrl', None) or getattr(params, 'url', None) or '' pref_o = urlparse(prefer) if prefer else None frames = list(pg.frames) @@ -176,7 +174,7 @@ def score(fr): await locator.click(force=True) - used_str = selector_used if 
isinstance(selector_used, str) and selector_used else params.cssSelector + used_str = selector_used if isinstance(selector_used, str) else params.cssSelector msg = f'🖱️ Clicked element with CSS selector: {truncate_selector(used_str)} (original: {truncate_selector(original_selector)})' logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) @@ -285,7 +283,7 @@ async def key_press(params: KeyPressDeterministicAction, browser_session: Browse async def scroll(params: ScrollDeterministicAction, browser_session: Browser) -> ActionResult: """Scroll the page by the given x/y pixel offsets.""" page = await browser_session.get_current_page() - await page.evaluate(f'window.scrollBy({params.scrollX}, {params.scrollY});') + await page.evaluate('(x, y) => window.scrollBy(x, y)', params.scrollX, params.scrollY) msg = f'📜 Scrolled page by (x={params.scrollX}, y={params.scrollY})' logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) diff --git a/workflows/workflow_use/recorder/service.py b/workflows/workflow_use/recorder/service.py index 5de21556..4c2f217f 100644 --- a/workflows/workflow_use/recorder/service.py +++ b/workflows/workflow_use/recorder/service.py @@ -2,6 +2,7 @@ import json import pathlib from typing import Optional +from urllib.parse import urlparse import uvicorn from browser_use import Browser @@ -88,14 +89,18 @@ async def _capture_and_signal_final_workflow(self, trigger_reason: str): wf = self.last_workflow_update_event.payload # Backend safety filter: drop about:blank and obvious ad/analytics iframe navigations try: + def _step_field(step, field: str): + if isinstance(step, dict): + return step.get(field) + return getattr(step, field, None) + clean_steps = [] for s in wf.steps: - st = getattr(s, 'type', None) or (s.get('type') if isinstance(s, dict) else None) - url = getattr(s, 'url', None) or (s.get('url') if isinstance(s, dict) else None) + st = _step_field(s, 'type') + url = _step_field(s, 'url') if st 
== 'navigation': if not url or url == 'about:blank': continue - from urllib.parse import urlparse host = urlparse(url).hostname or '' blocked = any( pat in host for pat in ( diff --git a/workflows/workflow_use/workflow/service.py b/workflows/workflow_use/workflow/service.py index d21da2e8..4b7c1ca4 100644 --- a/workflows/workflow_use/workflow/service.py +++ b/workflows/workflow_use/workflow/service.py @@ -11,7 +11,7 @@ from browser_use.agent.views import ActionResult, AgentHistoryList from langchain.agents import AgentExecutor, create_tool_calling_agent from langchain_core.language_models.chat_models import BaseChatModel -from langchain_core.messages import AIMessage, BaseMessage, HumanMessage +from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage from langchain_core.prompts import ChatPromptTemplate from langchain_core.tools import StructuredTool from pydantic import BaseModel, create_model @@ -462,7 +462,7 @@ async def _convert_results_to_output_model( combined_text = '\n\n'.join(extracted_contents) messages: list[BaseMessage] = [ - AIMessage(content=STRUCTURED_OUTPUT_PROMPT), + SystemMessage(content=STRUCTURED_OUTPUT_PROMPT), HumanMessage(content=combined_text), ] From 5bfd65ba7b30e461c9a3df4f0f422697deca94dc Mon Sep 17 00:00:00 2001 From: Diveyam Mishra Date: Sat, 25 Oct 2025 23:51:25 +0530 Subject: [PATCH 10/13] fix: default optional workflow inputs to None --- workflows/workflow_use/workflow/service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/workflow_use/workflow/service.py b/workflows/workflow_use/workflow/service.py index 4b7c1ca4..176709dc 100644 --- a/workflows/workflow_use/workflow/service.py +++ b/workflows/workflow_use/workflow/service.py @@ -634,7 +634,7 @@ async def _invoke(**kwargs): # type: ignore[override] augmented_inputs = kwargs.copy() if kwargs else {} for input_def in self.inputs_def: if not input_def.required and input_def.name not in augmented_inputs: - 
augmented_inputs[input_def.name] = '' + augmented_inputs[input_def.name] = None result = await self.run(inputs=augmented_inputs) # Serialise non-string output so models that expect a string tool # response still work. From 207b64622f1a8a1477aa07e5e042fca7f837ad46 Mon Sep 17 00:00:00 2001 From: Diveyam Mishra Date: Sun, 26 Oct 2025 02:26:00 +0530 Subject: [PATCH 11/13] refactor(controller): avoid direct playwright import --- workflows/workflow_use/controller/service.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py index 505ef6b6..5e8fa56f 100644 --- a/workflows/workflow_use/controller/service.py +++ b/workflows/workflow_use/controller/service.py @@ -87,8 +87,7 @@ async def click(params: ClickElementDeterministicAction, browser_session: Browse # If frameUrl or frameIdPath are provided, narrow the search to that frame def _select_context(pg): try: - from playwright.async_api import Page, Frame - ctx: Page | Frame = pg + ctx = pg # If frame hints point to top document, stay on page fid = getattr(params, 'frameIdPath', None) furl = getattr(params, 'frameUrl', None) @@ -99,7 +98,9 @@ def _select_context(pg): segs = [s for s in str(fid).split('.') if s != ''] if all(s == '0' for s in segs): return pg - f = pg.main_frame + f = getattr(pg, 'main_frame', None) + if not f: + return pg for s in segs[1:]: # skip top marker idx = int(s) if 0 <= idx < len(f.child_frames): @@ -116,7 +117,7 @@ def _select_context(pg): return pg except Exception: pass - for fr in pg.frames: + for fr in getattr(pg, 'frames', []): try: ff = urlparse(fr.url) if (ff.scheme, ff.netloc) == (pf.scheme, pf.netloc) and fr.url.startswith(furl): @@ -131,7 +132,7 @@ def _select_context(pg): async def _find_in_frames(pg, selector: str): prefer = getattr(params, 'frameUrl', None) or getattr(params, 'url', None) or '' pref_o = urlparse(prefer) if prefer else None - frames = list(pg.frames) + 
frames = list(getattr(pg, 'frames', [])) def score(fr): if not pref_o: return 0 From 7c4a0cc583291c10b37cbaed16632172ae2e00cd Mon Sep 17 00:00:00 2001 From: Diveyam Mishra <118588477+Diveyam-Mishra@users.noreply.github.com> Date: Mon, 27 Oct 2025 01:24:05 +0530 Subject: [PATCH 12/13] Update workflows/workflow_use/controller/service.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- workflows/workflow_use/controller/service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py index 470fe804..39ee6f74 100644 --- a/workflows/workflow_use/controller/service.py +++ b/workflows/workflow_use/controller/service.py @@ -184,7 +184,7 @@ async def key_press(params: KeyPressDeterministicAction, browser_session: Browse @self.registry.action('Scroll page', param_model=ScrollDeterministicAction) async def scroll(params: ScrollDeterministicAction, browser_session: Browser) -> ActionResult: page = await browser_session.must_get_current_page() - await page.evaluate('(x, y) => { window.scrollBy(x, y); return ""; }', params.scrollX, params.scrollY) + await page.evaluate('(x, y) => { window.scrollBy(x, y); }', params.scrollX, params.scrollY) msg = f'Scrolled page by (x={params.scrollX}, y={params.scrollY})' logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True) From 1c377b34e527a5fbe671a150ad18818cdaf380c9 Mon Sep 17 00:00:00 2001 From: Diveyam Mishra <118588477+Diveyam-Mishra@users.noreply.github.com> Date: Mon, 27 Oct 2025 01:35:54 +0530 Subject: [PATCH 13/13] Update workflows/workflow_use/controller/service.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- workflows/workflow_use/controller/service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/workflow_use/controller/service.py b/workflows/workflow_use/controller/service.py index 39ee6f74..7395d7fd 100644 --- 
a/workflows/workflow_use/controller/service.py +++ b/workflows/workflow_use/controller/service.py @@ -184,7 +184,7 @@ async def key_press(params: KeyPressDeterministicAction, browser_session: Browse @self.registry.action('Scroll page', param_model=ScrollDeterministicAction) async def scroll(params: ScrollDeterministicAction, browser_session: Browser) -> ActionResult: page = await browser_session.must_get_current_page() - await page.evaluate('(x, y) => { window.scrollBy(x, y); }', params.scrollX, params.scrollY) + await page.evaluate('([x, y]) => { window.scrollBy(x, y); }', [params.scrollX, params.scrollY]) msg = f'Scrolled page by (x={params.scrollX}, y={params.scrollY})' logger.info(msg) return ActionResult(extracted_content=msg, include_in_memory=True)