Skip to content

Commit d24766a

Browse files
authored
Merge pull request #74 from rajbos/copilot/modify-curl-command
Fix Puppeteer deprecated API and improve model scraping accuracy with normalized model names
2 parents 6e0b914 + b0c76e8 commit d24766a

4 files changed

Lines changed: 695 additions & 95 deletions

File tree

.github/scripts/scrape-models.sh

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
#!/bin/bash
# Scrape the list of GitHub Copilot supported models from the public docs.
# Uses Puppeteer (headless Chromium) because the docs page renders its
# content client-side. Produces:
#   scraped-models.json - JSON array of normalized model names
#   models.txt          - one normalized model name per line
#   page-content.html   - the relevant doc section HTML, for debugging
#   scraper.log         - combined scraper output, for the artifact upload
#
# pipefail: without it, `node | tee` reports tee's exit status and a
# scraper failure would be silently ignored despite `set -e`.
set -euo pipefail

echo "Installing Puppeteer..."
npm install puppeteer

echo "Creating scraper script..."
cat > scrape.js << 'SCRAPE_EOF'
const puppeteer = require('puppeteer');
const fs = require('fs');

(async () => {
  try {
    const browser = await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
    const page = await browser.newPage();

    // Forward in-page console output to stderr. The console.error calls
    // inside page.evaluate() run in the *browser* context and are otherwise
    // invisible to this Node process (and to scraper.log).
    page.on('console', (msg) => console.error(`[page] ${msg.text()}`));

    console.error('Navigating to page...');
    await page.goto('https://docs.github.com/en/copilot/reference/ai-models/supported-models', {
      waitUntil: 'networkidle0',
      timeout: 60000,
    });

    console.error('Content loaded, extracting models...');

    // Extract normalized model names from the tables under the
    // "Supported AI models in Copilot" heading.
    const models = await page.evaluate(() => {
      // Normalize a model name: lowercase, whitespace runs -> single dash.
      const normalize = (text) => text.toLowerCase().replace(/\s+/g, '-');

      const headings = Array.from(document.querySelectorAll('h2, h3'));
      const targetHeading = headings.find((h) =>
        h.textContent.includes('Supported AI models in Copilot'));

      if (!targetHeading) {
        console.error('ERROR: Could not find "Supported AI models in Copilot" heading');
        return [];
      }
      console.error('Found target heading:', targetHeading.textContent);

      // Collect every table between this heading and the next h2
      // (tables may be direct siblings or nested inside wrapper elements).
      const tables = [];
      let currentElement = targetHeading.nextElementSibling;
      while (currentElement && currentElement.tagName !== 'H2') {
        if (currentElement.tagName === 'TABLE') {
          tables.push(currentElement);
        } else if (currentElement.querySelectorAll) {
          tables.push(...currentElement.querySelectorAll('table'));
        }
        currentElement = currentElement.nextElementSibling;
      }
      console.error(`Found ${tables.length} tables in the target section`);

      const modelNames = [];
      tables.forEach((table, tableIndex) => {
        const rows = table.querySelectorAll('tbody tr');
        console.error(`Table ${tableIndex}: Found ${rows.length} rows`);

        rows.forEach((row, rowIndex) => {
          // The model name lives in the row header (th[scope="row"]);
          // fall back to the first data cell when no header exists.
          const cell = row.querySelector('th[scope="row"]') || row.querySelector('td');
          if (!cell) return;

          const text = cell.textContent.trim();
          console.error(`Table ${tableIndex}, Row ${rowIndex}: "${text}"`);
          if (text) {
            const normalizedName = normalize(text);
            console.error(`  Normalized: "${normalizedName}"`);
            modelNames.push(normalizedName);
          }
        });
      });

      // De-duplicate while preserving first-seen order.
      return [...new Set(modelNames)];
    });

    // Save only the relevant section HTML for debugging.
    const relevantHTML = await page.evaluate(() => {
      const headings = Array.from(document.querySelectorAll('h2, h3'));
      const targetHeading = headings.find((h) =>
        h.textContent.includes('Supported AI models in Copilot'));
      if (!targetHeading) {
        return '<p>Could not find target section</p>';
      }
      let html = '<h2>' + targetHeading.textContent + '</h2>\n';
      let currentElement = targetHeading.nextElementSibling;
      while (currentElement && currentElement.tagName !== 'H2') {
        html += currentElement.outerHTML + '\n';
        currentElement = currentElement.nextElementSibling;
      }
      return html;
    });

    fs.writeFileSync('page-content.html', relevantHTML);
    console.error('Saved relevant section HTML to page-content.html');

    console.error(`Extracted ${models.length} unique models`);

    // Save models as JSON for downstream steps and the artifact upload.
    fs.writeFileSync('scraped-models.json', JSON.stringify(models, null, 2));
    console.error('Saved scraped models to scraped-models.json');

    // Machine-readable output on stdout (all diagnostics go to stderr).
    console.log(JSON.stringify(models));

    await browser.close();
  } catch (error) {
    console.error('Error:', error.message);
    console.error('Stack trace:', error.stack);
    process.exit(1);
  }
})();
SCRAPE_EOF

echo "Running scraper..."
node scrape.js 2>&1 | tee scraper.log

# Read the authoritative JSON the scraper wrote to disk. Parsing the last
# line of the 2>&1-merged, tee'd stream is fragile: stdout/stderr ordering
# through a pipe is not guaranteed, so `tail -n 1` could pick up a
# diagnostic line instead of the JSON array.
echo "Scraped models JSON: $(cat scraped-models.json)"

# Store the models, one per line
jq -r '.[]' scraped-models.json > models.txt
echo "Models extracted to models.txt:"
cat models.txt

echo "Scraping complete!"

.github/workflows/check-models.yml

Lines changed: 40 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -25,93 +25,19 @@ jobs:
2525
- name: Fetch documentation page
2626
id: fetch_docs
2727
run: |
28-
# Install puppeteer for browser-based scraping
29-
npm install puppeteer
30-
31-
# Create a Node.js script to scrape the page with JavaScript rendering
32-
cat > scrape.js << 'SCRAPE_EOF'
33-
const puppeteer = require('puppeteer');
34-
35-
(async () => {
36-
try {
37-
const browser = await puppeteer.launch({
38-
headless: 'new',
39-
args: ['--no-sandbox', '--disable-setuid-sandbox']
40-
});
41-
const page = await browser.newPage();
42-
43-
console.error('Navigating to page...');
44-
await page.goto('https://docs.github.com/en/copilot/reference/ai-models/supported-models', {
45-
waitUntil: 'networkidle0',
46-
timeout: 60000
47-
});
48-
49-
console.error('Waiting for content to load...');
50-
await page.waitForTimeout(5000);
51-
52-
// Extract model names from the tables
53-
const models = await page.evaluate(() => {
54-
const modelNames = [];
55-
56-
// Find all tables on the page
57-
const tables = document.querySelectorAll('table');
58-
console.error(`Found ${tables.length} tables`);
59-
60-
tables.forEach((table, tableIndex) => {
61-
const rows = table.querySelectorAll('tbody tr');
62-
console.error(`Table ${tableIndex}: Found ${rows.length} rows`);
63-
64-
rows.forEach((row, rowIndex) => {
65-
const cells = row.querySelectorAll('td');
66-
if (cells.length > 0) {
67-
// Get text from first cell and clean it
68-
let text = cells[0].textContent.trim();
69-
console.error(`Table ${tableIndex}, Row ${rowIndex}: "${text}"`);
70-
71-
if (text && text.length > 0) {
72-
modelNames.push(text);
73-
}
74-
}
75-
});
76-
});
77-
78-
// Remove duplicates
79-
return [...new Set(modelNames)];
80-
});
81-
82-
console.error(`Extracted ${models.length} unique models`);
83-
console.log(JSON.stringify(models));
84-
await browser.close();
85-
} catch (error) {
86-
console.error('Error:', error.message);
87-
process.exit(1);
88-
}
89-
})();
90-
SCRAPE_EOF
91-
92-
# Run the script and capture output
93-
MODELS_JSON=$(node scrape.js 2>&1 | tee /dev/stderr | tail -n 1)
94-
echo "Scraped models JSON: $MODELS_JSON"
95-
96-
# Store the models, one per line
97-
echo "$MODELS_JSON" | jq -r '.[]' > models.txt
98-
echo "Models extracted:"
99-
cat models.txt
28+
# Run the scraping script
29+
chmod +x .github/scripts/scrape-models.sh
30+
.github/scripts/scrape-models.sh
10031
101-
- name: List available models from GitHub Models API
102-
id: list_models
103-
env:
104-
GITHUB_TOKEN: ${{ secrets.GH_PAT }}
105-
run: |
106-
# Get list of available models from GitHub Models API
107-
MODELS_CATALOG=$(curl -L -s \
108-
-H "Accept: application/vnd.github+json" \
109-
-H "Authorization: Bearer $GITHUB_TOKEN" \
110-
-H "X-GitHub-Api-Version: 2022-11-28" \
111-
https://models.github.ai/catalog/models)
112-
113-
echo "Available models in GitHub Models API:"
114-
echo "$MODELS_CATALOG" | jq -r '.[].id'
32+
- name: Upload scraped data as artifact
33+
uses: actions/upload-artifact@v4
34+
with:
35+
name: scraped-data
36+
path: |
37+
scraped-models.json
38+
models.txt
39+
page-content.html
40+
scraper.log
11541
11642
- name: Extract models from scraped data
11743
id: fetch_models
@@ -124,8 +50,11 @@ jobs:
12450
12551
MODELS=$(cat models.txt | sort -u)
12652
127-
echo "Fetched Models:"
53+
echo "=== Scraped Models from Documentation ==="
12854
echo "$MODELS"
55+
echo ""
56+
echo "Count: $(echo "$MODELS" | wc -l) models"
57+
echo ""
12958
13059
if [ -z "$MODELS" ]; then
13160
echo "Error: No models extracted"
@@ -187,9 +116,33 @@ jobs:
187116
echo "issue_body<<EOF" >> $GITHUB_OUTPUT
188117
echo -e "$ISSUE_BODY" >> $GITHUB_OUTPUT
189118
echo "EOF" >> $GITHUB_OUTPUT
119+
120+
# Create step summary
121+
echo "## ⚠️ Missing Models Detected" >> $GITHUB_STEP_SUMMARY
122+
echo "" >> $GITHUB_STEP_SUMMARY
123+
if [ -n "$MISSING_ESTIMATORS" ]; then
124+
echo "### 🚨 Missing in \`tokenEstimators.json\`" >> $GITHUB_STEP_SUMMARY
125+
echo -e "$MISSING_ESTIMATORS" >> $GITHUB_STEP_SUMMARY
126+
echo "" >> $GITHUB_STEP_SUMMARY
127+
fi
128+
if [ -n "$MISSING_PRICING" ]; then
129+
echo "### 💰 Missing in \`modelPricing.json\`" >> $GITHUB_STEP_SUMMARY
130+
echo -e "$MISSING_PRICING" >> $GITHUB_STEP_SUMMARY
131+
echo "" >> $GITHUB_STEP_SUMMARY
132+
fi
133+
echo "**Action Required:** Update the JSON configuration files with the latest models." >> $GITHUB_STEP_SUMMARY
134+
echo "" >> $GITHUB_STEP_SUMMARY
135+
echo "[View Documentation](https://docs.github.com/en/copilot/reference/ai-models/supported-models)" >> $GITHUB_STEP_SUMMARY
190136
else
191137
echo "needs_update=false" >> $GITHUB_OUTPUT
192138
echo "✅ All models are up-to-date."
139+
140+
# Create step summary
141+
echo "## ✅ All Models Up-to-Date" >> $GITHUB_STEP_SUMMARY
142+
echo "" >> $GITHUB_STEP_SUMMARY
143+
echo "All models from the documentation are present in both:" >> $GITHUB_STEP_SUMMARY
144+
echo "- \`tokenEstimators.json\`" >> $GITHUB_STEP_SUMMARY
145+
echo "- \`modelPricing.json\`" >> $GITHUB_STEP_SUMMARY
193146
fi
194147
195148
- name: Create GitHub Issue if models are missing

0 commit comments

Comments
 (0)