Integrate OpenHandoff factory workspace (#212)

This commit is contained in:
Nathan Flurry 2026-03-09 14:00:20 -07:00 committed by GitHub
parent 3d9476ed0b
commit bf282199b5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
251 changed files with 42824 additions and 692 deletions

View file

@ -1,68 +1,69 @@
#!/bin/bash
# Template: Content Capture Workflow
# Extract content from web pages with optional authentication
# Purpose: Extract content from web pages (text, screenshots, PDF)
# Usage: ./capture-workflow.sh <url> [output-dir]
#
# Outputs:
# - page-full.png: Full page screenshot
# - page-structure.txt: Page element structure with refs
# - page-text.txt: All text content
# - page.pdf: PDF version
#
# Optional: Load auth state for protected pages
set -euo pipefail
TARGET_URL="${1:?Usage: $0 <url> [output-dir]}"
OUTPUT_DIR="${2:-.}"
echo "Capturing content from: $TARGET_URL"
echo "Capturing: $TARGET_URL"
mkdir -p "$OUTPUT_DIR"
# Optional: Load authentication state if needed
# Optional: Load authentication state
# if [[ -f "./auth-state.json" ]]; then
# echo "Loading authentication state..."
# agent-browser state load "./auth-state.json"
# fi
# Navigate to target page
# Navigate to target
agent-browser open "$TARGET_URL"
agent-browser wait --load networkidle
# Get page metadata
echo "Page title: $(agent-browser get title)"
echo "Page URL: $(agent-browser get url)"
# Get metadata
TITLE=$(agent-browser get title)
URL=$(agent-browser get url)
echo "Title: $TITLE"
echo "URL: $URL"
# Capture full page screenshot
agent-browser screenshot --full "$OUTPUT_DIR/page-full.png"
echo "Screenshot saved: $OUTPUT_DIR/page-full.png"
echo "Saved: $OUTPUT_DIR/page-full.png"
# Get page structure
# Get page structure with refs
agent-browser snapshot -i > "$OUTPUT_DIR/page-structure.txt"
echo "Structure saved: $OUTPUT_DIR/page-structure.txt"
echo "Saved: $OUTPUT_DIR/page-structure.txt"
# Extract main content
# Adjust selector based on target site structure
# agent-browser get text @e1 > "$OUTPUT_DIR/main-content.txt"
# Extract specific elements (uncomment as needed)
# agent-browser get text "article" > "$OUTPUT_DIR/article.txt"
# agent-browser get text "main" > "$OUTPUT_DIR/main.txt"
# agent-browser get text ".content" > "$OUTPUT_DIR/content.txt"
# Get full page text
# Extract all text content
agent-browser get text body > "$OUTPUT_DIR/page-text.txt"
echo "Text content saved: $OUTPUT_DIR/page-text.txt"
echo "Saved: $OUTPUT_DIR/page-text.txt"
# Optional: Save as PDF
# Save as PDF
agent-browser pdf "$OUTPUT_DIR/page.pdf"
echo "PDF saved: $OUTPUT_DIR/page.pdf"
echo "Saved: $OUTPUT_DIR/page.pdf"
# Optional: Capture with scrolling for infinite scroll pages
# scroll_and_capture() {
# local count=0
# while [[ $count -lt 5 ]]; do
# agent-browser scroll down 1000
# agent-browser wait 1000
# ((count++))
# done
# agent-browser screenshot --full "$OUTPUT_DIR/page-scrolled.png"
# }
# scroll_and_capture
# Optional: Extract specific elements using refs from structure
# agent-browser get text @e5 > "$OUTPUT_DIR/main-content.txt"
# Optional: Handle infinite scroll pages
# for i in {1..5}; do
# agent-browser scroll down 1000
# agent-browser wait 1000
# done
# agent-browser screenshot --full "$OUTPUT_DIR/page-scrolled.png"
# Cleanup
agent-browser close
echo ""
echo "Capture complete! Files saved to: $OUTPUT_DIR"
echo "Capture complete:"
ls -la "$OUTPUT_DIR"