fix: consistent Screenshot → UI state emission order across all agents

- ManagerAgent and StatelessManagerAgent now emit ScreenshotEvent
  before RecordUIStateEvent, matching FastAgent/CodeAct ordering
- DroidAgent finalize step now emits RecordUIStateEvent after the
  final screenshot so every screenshot has a paired UI state
  (best-effort: a failed UI state capture is logged as a warning)
This commit is contained in:
johnmalek312
2026-03-11 03:39:38 +11:00
parent d6d59761a6
commit f3df299547
3 changed files with 52 additions and 43 deletions
+10 -1
View File
@@ -1085,7 +1085,7 @@ class DroidAgent(Workflow):
if self.config.logging.debug:
logger.error(traceback.format_exc())
# Capture final screenshot (independent of trajectory persistence)
# Capture final screenshot and UI state (independent of trajectory persistence)
vision_any = (
self.config.agent.manager.vision
or self.config.agent.executor.vision
@@ -1111,6 +1111,15 @@ class DroidAgent(Workflow):
except Exception as e:
logger.warning(f"Failed to capture final screenshot: {e}")
try:
ui_state = await self.state_provider.get_state()
ctx.write_event_to_stream(
RecordUIStateEvent(ui_state=ui_state.elements)
)
logger.debug("📋 Final UI state captured")
except Exception as e:
logger.warning(f"Failed to capture final UI state: {e}")
# Save trajectory to disk
if self.config.logging.save_trajectory != "none":
# Populate macro data from RecordingDriver log
+24 -24
View File
@@ -387,6 +387,30 @@ class ManagerAgent(Workflow):
"""Gather context and prepare manager prompt."""
logger.debug("💬 Preparing manager context...")
# Capture screenshot if needed
screenshot = None
if self.vision or self._stream_screenshots or self.save_trajectory != "none":
try:
screenshot = await self.action_ctx.driver.screenshot()
if screenshot:
ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
parent_span = trace.get_current_span()
record_langfuse_screenshot(
screenshot,
parent_span=parent_span,
screenshots_enabled=bool(
self.tracing_config
and self.tracing_config.langfuse_screenshots
),
vision_enabled=self.vision,
)
logger.debug("📸 Screenshot captured for Manager")
except DeviceDisconnectedError:
raise
except Exception as e:
logger.warning(f"Failed to capture screenshot: {e}")
# Get and format device state
ui_state = await self.state_provider.get_state()
self.action_ctx.ui = ui_state
@@ -422,30 +446,6 @@ class ManagerAgent(Workflow):
else:
self.shared_state.app_card = ""
# Capture screenshot if needed
screenshot = None
if self.vision or self._stream_screenshots or self.save_trajectory != "none":
try:
screenshot = await self.action_ctx.driver.screenshot()
if screenshot:
ctx.write_event_to_stream(ScreenshotEvent(screenshot=screenshot))
parent_span = trace.get_current_span()
record_langfuse_screenshot(
screenshot,
parent_span=parent_span,
screenshots_enabled=bool(
self.tracing_config
and self.tracing_config.langfuse_screenshots
),
vision_enabled=self.vision,
)
logger.debug("📸 Screenshot captured for Manager")
except DeviceDisconnectedError:
raise
except Exception as e:
logger.warning(f"Failed to capture screenshot: {e}")
# Detect text manipulation mode
focused_text_clean = self.shared_state.focused_text.replace("'", "").strip()
has_text_to_modify = focused_text_clean != ""
@@ -175,24 +175,6 @@ class StatelessManagerAgent(Workflow):
async def prepare_context(
self, ctx: Context, ev: StartEvent
) -> ManagerContextEvent:
ui_state = await self.state_provider.get_state()
self.action_ctx.ui = ui_state
self.shared_state.previous_formatted_device_state = (
self.shared_state.formatted_device_state
)
self.shared_state.formatted_device_state = ui_state.formatted_text
self.shared_state.focused_text = ui_state.focused_text
self.shared_state.a11y_tree = ui_state.elements
self.shared_state.phone_state = ui_state.phone_state
self.shared_state.update_current_app(
package_name=ui_state.phone_state.get("packageName", "Unknown"),
activity_name=ui_state.phone_state.get("currentApp", "Unknown"),
)
ctx.write_event_to_stream(RecordUIStateEvent(ui_state=ui_state.elements))
screenshot = None
if self.vision or self.save_trajectory != "none":
try:
@@ -215,6 +197,24 @@ class StatelessManagerAgent(Workflow):
except Exception as e:
logger.warning(f"Failed to capture screenshot: {e}")
ui_state = await self.state_provider.get_state()
self.action_ctx.ui = ui_state
self.shared_state.previous_formatted_device_state = (
self.shared_state.formatted_device_state
)
self.shared_state.formatted_device_state = ui_state.formatted_text
self.shared_state.focused_text = ui_state.focused_text
self.shared_state.a11y_tree = ui_state.elements
self.shared_state.phone_state = ui_state.phone_state
self.shared_state.update_current_app(
package_name=ui_state.phone_state.get("packageName", "Unknown"),
activity_name=ui_state.phone_state.get("currentApp", "Unknown"),
)
ctx.write_event_to_stream(RecordUIStateEvent(ui_state=ui_state.elements))
focused_text_clean = self.shared_state.focused_text.replace("'", "").strip()
has_text_to_modify = focused_text_clean != ""