fix(browse-sh): fetch SKILL.md via /api/skills/{slug}+skillMdUrl

The catalog's sourceUrl points at github.com/browserbase/browse.sh,
whose underlying repository is not always public — most raw URLs derived
from it 404. Use the per-skill detail endpoint instead, which returns a
skillMdUrl CDN blob that reliably resolves to the SKILL.md text. Fall
back to a raw.githubusercontent.com sourceUrl if the detail call fails.

- tools/skills_hub.py: rewrite BrowseShSource.fetch() to resolve via
  /api/skills/{slug} -> skillMdUrl; drop the now-unused _to_raw_url
  helper; expose the resolved URL in bundle.metadata.skill_md_url.
- tests/tools/test_skills_hub_browse_sh.py: match the real catalog
  shape (name = task name, slug = host/task-id), exercise the
  detail-endpoint -> blob two-call flow, and add a fallback test.
- scripts/release.py: map kylejeong21@gmail.com -> Kylejeong2.
This commit is contained in:
teknium1
2026-05-19 14:14:22 -07:00
committed by Teknium
parent 90be1be501
commit 890b2ebd5b
3 changed files with 90 additions and 55 deletions
+1
View File
@@ -104,6 +104,7 @@ AUTHOR_MAP = {
"147827411+EloquentBrush@users.noreply.github.com": "AhmetArif0",
"97489706+purzbeats@users.noreply.github.com": "purzbeats",
"hugosequier@gmail.com": "Hugo-SEQUIER",
"kylejeong21@gmail.com": "Kylejeong2",
"128259593+Gutslabs@users.noreply.github.com": "Gutslabs",
"50326054+nocturnum91@users.noreply.github.com": "nocturnum91",
"52470719+gianfrancopiana@users.noreply.github.com": "gianfrancopiana",
+44 -30
View File
@@ -6,29 +6,31 @@ from unittest.mock import patch
from tools.skills_hub import BrowseShSource, SkillMeta, SkillBundle
# Catalog shape mirrors the real ``GET https://browse.sh/api/skills`` response:
# ``slug`` is ``<hostname>/<task-id>`` and ``name`` is the task name.
SAMPLE_CATALOG = [
{
"slug": "airbnb.com/search-listings-ddgioa",
"name": "airbnb.com",
"name": "search-listings",
"title": "Airbnb Search Listings",
"description": "Search and browse Airbnb listings by location and dates.",
"hostname": "airbnb.com",
"category": "travel",
"tags": ["travel", "accommodation"],
"sourceUrl": "https://github.com/browserbase/browse-sh/blob/main/skills/airbnb.com/SKILL.md",
"sourceUrl": "https://github.com/browserbase/browse.sh/blob/main/skills/airbnb.com/search-listings-ddgioa/SKILL.md",
"recommendedMethod": "stagehand",
"proxies": False,
"installCount": 42,
},
{
"slug": "amazon.com/search-products-xyz",
"name": "amazon.com",
"name": "search-products",
"title": "Amazon Product Search",
"description": "Search for products on Amazon.",
"hostname": "amazon.com",
"category": "shopping",
"tags": ["shopping", "ecommerce"],
"sourceUrl": "https://raw.githubusercontent.com/browserbase/browse-sh/main/skills/amazon.com/SKILL.md",
"sourceUrl": "https://github.com/browserbase/browse.sh/blob/main/skills/amazon.com/search-products-xyz/SKILL.md",
"recommendedMethod": "stagehand",
"proxies": False,
"installCount": 99,
@@ -60,7 +62,7 @@ class TestBrowseShSource(unittest.TestCase):
self.assertGreaterEqual(len(results), 1)
meta = results[0]
self.assertIsInstance(meta, SkillMeta)
self.assertEqual(meta.name, "airbnb.com")
self.assertEqual(meta.name, "search-listings")
self.assertEqual(meta.source, "browse-sh")
self.assertEqual(meta.trust_level, "community")
self.assertEqual(meta.identifier, "browse-sh/airbnb.com/search-listings-ddgioa")
@@ -70,7 +72,7 @@ class TestBrowseShSource(unittest.TestCase):
def test_search_filters_by_query(self, _mock_catalog):
results = self.src.search("amazon", limit=10)
self.assertEqual(len(results), 1)
self.assertEqual(results[0].name, "amazon.com")
self.assertEqual(results[0].extra["hostname"], "amazon.com")
results_all = self.src.search("", limit=10)
self.assertEqual(len(results_all), 2)
@@ -78,22 +80,50 @@ class TestBrowseShSource(unittest.TestCase):
@patch("tools.skills_hub.httpx.get")
@patch.object(BrowseShSource, "_fetch_catalog", return_value=SAMPLE_CATALOG)
def test_fetch_returns_bundle(self, _mock_catalog, mock_get):
mock_get.return_value = _MockResponse(
status_code=200,
text="# Airbnb Skill\n\nSearch and book Airbnb listings.",
# First call: GET /api/skills/{slug} returns the detail object with skillMdUrl.
# Second call: GET the CDN blob URL returns the SKILL.md text.
blob_url = (
"https://gh0lfhlmyzhg6tww.public.blob.vercel-storage.com"
"/skills/airbnb.com/search-listings-ddgioa/SKILL.md"
)
mock_get.side_effect = [
_MockResponse(status_code=200, json_data={"skillMdUrl": blob_url}),
_MockResponse(status_code=200, text="# Airbnb Skill\n\nSearch and book Airbnb listings."),
]
bundle = self.src.fetch("browse-sh/airbnb.com/search-listings-ddgioa")
self.assertIsNotNone(bundle)
self.assertIsInstance(bundle, SkillBundle)
self.assertEqual(bundle.name, "airbnb.com")
self.assertEqual(bundle.name, "search-listings")
self.assertIn("SKILL.md", bundle.files)
self.assertIn("Airbnb", bundle.files["SKILL.md"])
self.assertEqual(bundle.source, "browse-sh")
self.assertEqual(bundle.trust_level, "community")
self.assertEqual(bundle.identifier, "browse-sh/airbnb.com/search-listings-ddgioa")
mock_get.assert_called_once()
call_url = mock_get.call_args.args[0]
self.assertIn("raw.githubusercontent.com", call_url)
self.assertEqual(bundle.metadata["skill_md_url"], blob_url)
# Two HTTP calls: detail endpoint + blob.
self.assertEqual(mock_get.call_count, 2)
first_url = mock_get.call_args_list[0].args[0]
second_url = mock_get.call_args_list[1].args[0]
self.assertIn("/api/skills/airbnb.com/search-listings-ddgioa", first_url)
self.assertEqual(second_url, blob_url)
@patch("tools.skills_hub.httpx.get")
@patch.object(BrowseShSource, "_fetch_catalog", return_value=SAMPLE_CATALOG)
def test_fetch_falls_back_to_raw_github_url(self, _mock_catalog, mock_get):
    """Detail endpoint fails -> fetch() falls back to a raw.githubusercontent.com sourceUrl."""
    raw_url = (
        "https://raw.githubusercontent.com/example/repo/main/skills/"
        "airbnb.com/search-listings-ddgioa/SKILL.md"
    )
    # Build a modified copy so the shared SAMPLE_CATALOG fixture is never mutated.
    raw_catalog = [dict(SAMPLE_CATALOG[0], sourceUrl=raw_url)]
    with patch.object(BrowseShSource, "_fetch_catalog", return_value=raw_catalog):
        mock_get.side_effect = [
            _MockResponse(status_code=500, json_data=None),  # detail endpoint fails
            _MockResponse(status_code=200, text="# Fallback content"),
        ]
        bundle = self.src.fetch("browse-sh/airbnb.com/search-listings-ddgioa")
    self.assertIsNotNone(bundle)
    self.assertEqual(bundle.files["SKILL.md"], "# Fallback content")
    # Two HTTP calls: the failed detail lookup, then the fallback URL itself.
    # Without these checks the test would pass even if content came from
    # somewhere other than the catalog's raw sourceUrl.
    self.assertEqual(mock_get.call_count, 2)
    self.assertEqual(mock_get.call_args_list[1].args[0], raw_url)
    self.assertEqual(bundle.metadata["skill_md_url"], raw_url)
@patch.object(BrowseShSource, "_fetch_catalog", return_value=SAMPLE_CATALOG)
def test_fetch_missing_slug_returns_none(self, _mock_catalog):
@@ -105,28 +135,12 @@ class TestBrowseShSource(unittest.TestCase):
meta = self.src.inspect("browse-sh/airbnb.com/search-listings-ddgioa")
self.assertIsNotNone(meta)
self.assertIsInstance(meta, SkillMeta)
self.assertEqual(meta.name, "airbnb.com")
self.assertEqual(meta.name, "search-listings")
self.assertEqual(meta.identifier, "browse-sh/airbnb.com/search-listings-ddgioa")
self.assertEqual(meta.extra["hostname"], "airbnb.com")
self.assertEqual(meta.extra["category"], "travel")
self.assertEqual(meta.extra["install_count"], 42)
def test_to_raw_url_conversion(self):
# GitHub HTML URL should be converted
html_url = "https://github.com/browserbase/browse-sh/blob/main/skills/airbnb.com/SKILL.md"
raw_url = self.src._to_raw_url(html_url)
self.assertEqual(
raw_url,
"https://raw.githubusercontent.com/browserbase/browse-sh/main/skills/airbnb.com/SKILL.md",
)
# Already a raw URL — should be returned unchanged
already_raw = "https://raw.githubusercontent.com/browserbase/browse-sh/main/skills/amazon.com/SKILL.md"
self.assertEqual(self.src._to_raw_url(already_raw), already_raw)
# Unrecognised URL — should return None
self.assertIsNone(self.src._to_raw_url("https://example.com/something"))
if __name__ == "__main__":
unittest.main()
+45 -25
View File
@@ -2358,12 +2358,17 @@ class LobeHubSource(SkillSource):
class BrowseShSource(SkillSource):
"""Discover and install site-specific browser automation skills from browse.sh.
browse.sh (https://browse.sh) is Browserbase's catalog of 169+ SKILL.md files
browse.sh (https://browse.sh) is Browserbase's catalog of 200+ SKILL.md files
that describe how to automate specific websites (Airbnb, Amazon, arXiv, etc.).
Each skill has a sourceUrl pointing to the raw SKILL.md on GitHub.
The catalog lives at ``/api/skills`` and each skill's actual SKILL.md content
is fetched via ``/api/skills/{slug}`` which returns a ``skillMdUrl`` field
pointing at a CDN-hosted blob — the catalog's ``sourceUrl`` field is a GitHub
HTML URL whose underlying repository is not always public, so it cannot be
relied on for content fetch.
"""
CATALOG_URL = "https://browse.sh/api/skills"
SKILL_DETAIL_URL = "https://browse.sh/api/skills/{slug}"
_CACHE_KEY = "browse_sh_catalog"
def source_id(self) -> str:
@@ -2454,20 +2459,22 @@ class BrowseShSource(SkillSource):
item = next((i for i in catalog if i.get("slug") == slug), None)
if not item:
return None
source_url = item.get("sourceUrl", "")
if not source_url:
return None
# Convert GitHub HTML URL to raw URL if needed
raw_url = self._to_raw_url(source_url)
if not raw_url:
# Resolve the actual SKILL.md content URL via the per-skill detail
# endpoint, which returns a ``skillMdUrl`` (CDN blob). The catalog's
# ``sourceUrl`` is a GitHub HTML link whose underlying repo is not
# reliably public, so we don't use it for content.
md_url = self._resolve_skill_md_url(slug, item)
if not md_url:
return None
try:
resp = httpx.get(raw_url, timeout=20, follow_redirects=True)
resp = httpx.get(md_url, timeout=20, follow_redirects=True)
if resp.status_code != 200:
return None
content = resp.text
except httpx.HTTPError:
return None
meta = self._item_to_meta(item)
name = meta.name if meta else slug.split("/")[-1]
return SkillBundle(
@@ -2479,31 +2486,44 @@ class BrowseShSource(SkillSource):
metadata={
"slug": slug,
"hostname": item.get("hostname", ""),
"source_url": source_url,
"source_url": item.get("sourceUrl", ""),
"skill_md_url": md_url,
},
)
def _resolve_skill_md_url(self, slug: str, item: Dict) -> Optional[str]:
    """Resolve the SKILL.md content URL for a slug.

    Primary path: GET ``SKILL_DETAIL_URL`` formatted with ``slug`` and read
    ``skillMdUrl`` from the JSON object it returns.
    Fallback: if the catalog item's ``sourceUrl`` is hosted on
    ``raw.githubusercontent.com``, use it directly.

    Args:
        slug: Catalog slug substituted into ``SKILL_DETAIL_URL``.
        item: Catalog entry for the slug; only ``sourceUrl`` is read.

    Returns:
        An http(s) URL string, or ``None`` when neither path yields one.
    """
    # Function-scope import so this method does not depend on what the
    # module happens to import at the top.
    from urllib.parse import urlparse

    def _http_host(candidate: object) -> Optional[str]:
        """Return the lowercased host of an http(s) URL, else ``None``."""
        if not isinstance(candidate, str):
            return None
        try:
            parts = urlparse(candidate)
            host = parts.hostname
        except ValueError:
            # urlparse rejects some malformed netlocs (e.g. bad IPv6 brackets).
            return None
        if parts.scheme not in ("http", "https"):
            return None
        return host or None

    try:
        detail = httpx.get(
            self.SKILL_DETAIL_URL.format(slug=slug),
            timeout=20,
            follow_redirects=True,
        )
        if detail.status_code == 200:
            data = detail.json()
            if isinstance(data, dict):
                md_url = data.get("skillMdUrl")
                # Require a real http(s) URL with a host, not merely a
                # string that begins with "http".
                if _http_host(md_url):
                    return md_url
    except (httpx.HTTPError, ValueError):
        # Best-effort lookup: transport errors and undecodable bodies
        # (json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses) fall through to the sourceUrl fallback below.
        pass

    source_url = item.get("sourceUrl", "") if isinstance(item, dict) else ""
    # Compare the parsed host rather than substring-matching, so a URL that
    # merely contains "raw.githubusercontent.com" in its path or query on
    # another host is not accepted.
    if _http_host(source_url) == "raw.githubusercontent.com":
        return source_url
    return None
def _slug_from_identifier(self, identifier: str) -> str:
"""Extract slug from identifier like 'browse-sh/airbnb.com/search-listings-abc'."""
if identifier.startswith("browse-sh/"):
return identifier[len("browse-sh/"):]
return identifier
def _to_raw_url(self, url: str) -> Optional[str]:
"""Convert a GitHub HTML URL to a raw.githubusercontent.com URL."""
if "raw.githubusercontent.com" in url:
return url
# https://github.com/owner/repo/blob/branch/path -> raw URL
import re
m = re.match(
r"https://github\.com/([^/]+)/([^/]+)/blob/([^/]+)/(.+)",
url,
)
if m:
owner, repo, branch, path = m.groups()
return f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}"
return None
# ---------------------------------------------------------------------------
# Official optional skills source adapter