Spaces:
Sleeping
Sleeping
Improve scraper robustness for Great Bear (YCGBR)
Browse files
Changes:
- Increased page load timeout to 60s
- Increased table wait timeout to 45s
- Added longer sleep (8s) for dynamic content
- Improved table detection to look for 'Date/Time' or 'Date' headers
- Added console logging for debugging
- Changed Great Bear ID back to YCGBR with better scraper
The scraper now waits longer and is more flexible in detecting data tables,
which should resolve issues with slower-loading stations like Great Bear.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
app.py
CHANGED
|
@@ -46,35 +46,46 @@ def scrape_weather_data(site_id, hours=720):
|
|
| 46 |
)
|
| 47 |
|
| 48 |
page = context.new_page()
|
| 49 |
-
response = page.goto(url)
|
| 50 |
print(f"Response status: {response.status}")
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
| 55 |
print("Extracting data...")
|
| 56 |
content = page.evaluate('''() => {
|
| 57 |
const getTextContent = () => {
|
| 58 |
const rows = [];
|
| 59 |
const tables = document.getElementsByTagName('table');
|
|
|
|
|
|
|
| 60 |
for (const table of tables) {
|
| 61 |
-
|
|
|
|
|
|
|
| 62 |
const headerRow = Array.from(table.querySelectorAll('th'))
|
| 63 |
.map(th => th.textContent.trim());
|
| 64 |
-
|
|
|
|
|
|
|
| 65 |
const dataRows = Array.from(table.querySelectorAll('tbody tr'))
|
| 66 |
.map(row => Array.from(row.querySelectorAll('td'))
|
| 67 |
.map(td => td.textContent.trim()));
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
}
|
| 71 |
}
|
| 72 |
return null;
|
| 73 |
};
|
| 74 |
-
|
| 75 |
return getTextContent();
|
| 76 |
}''')
|
| 77 |
-
|
| 78 |
print(f"Found {len(content['rows'] if content else [])} rows of data")
|
| 79 |
browser.close()
|
| 80 |
|
|
@@ -421,7 +432,7 @@ with gr.Blocks(title="Weather Station Data Analyzer") as demo:
|
|
| 421 |
("Yellowstone Club - Andesite", "YCAND"),
|
| 422 |
("Yellowstone Club - American Spirit", "YCAMS"),
|
| 423 |
("Yellowstone Club - Base", "YCBAS"),
|
| 424 |
-
("Yellowstone Club - Great Bear", "
|
| 425 |
("Bozeman Airport", "KBZN"),
|
| 426 |
("Salt Lake City", "KSLC")
|
| 427 |
],
|
|
|
|
| 46 |
)
|
| 47 |
|
| 48 |
page = context.new_page()
|
| 49 |
+
response = page.goto(url, timeout=60000)
|
| 50 |
print(f"Response status: {response.status}")
|
| 51 |
+
|
| 52 |
+
# Wait for table to load with longer timeout
|
| 53 |
+
page.wait_for_selector('table', timeout=45000)
|
| 54 |
+
time.sleep(8) # Give more time for dynamic content
|
| 55 |
+
|
| 56 |
print("Extracting data...")
|
| 57 |
content = page.evaluate('''() => {
|
| 58 |
const getTextContent = () => {
|
| 59 |
const rows = [];
|
| 60 |
const tables = document.getElementsByTagName('table');
|
| 61 |
+
console.log(`Found ${tables.length} tables`);
|
| 62 |
+
|
| 63 |
for (const table of tables) {
|
| 64 |
+
const text = table.textContent;
|
| 65 |
+
// Look for Date/Time or just Date as header
|
| 66 |
+
if (text.includes('Date/Time') || text.includes('Date')) {
|
| 67 |
const headerRow = Array.from(table.querySelectorAll('th'))
|
| 68 |
.map(th => th.textContent.trim());
|
| 69 |
+
|
| 70 |
+
console.log('Headers:', headerRow);
|
| 71 |
+
|
| 72 |
const dataRows = Array.from(table.querySelectorAll('tbody tr'))
|
| 73 |
.map(row => Array.from(row.querySelectorAll('td'))
|
| 74 |
.map(td => td.textContent.trim()));
|
| 75 |
+
|
| 76 |
+
console.log(`Found ${dataRows.length} data rows`);
|
| 77 |
+
|
| 78 |
+
if (dataRows.length > 0) {
|
| 79 |
+
return {headers: headerRow, rows: dataRows};
|
| 80 |
+
}
|
| 81 |
}
|
| 82 |
}
|
| 83 |
return null;
|
| 84 |
};
|
| 85 |
+
|
| 86 |
return getTextContent();
|
| 87 |
}''')
|
| 88 |
+
|
| 89 |
print(f"Found {len(content['rows'] if content else [])} rows of data")
|
| 90 |
browser.close()
|
| 91 |
|
|
|
|
| 432 |
("Yellowstone Club - Andesite", "YCAND"),
|
| 433 |
("Yellowstone Club - American Spirit", "YCAMS"),
|
| 434 |
("Yellowstone Club - Base", "YCBAS"),
|
| 435 |
+
("Yellowstone Club - Great Bear", "YCGBR"),
|
| 436 |
("Bozeman Airport", "KBZN"),
|
| 437 |
("Salt Lake City", "KSLC")
|
| 438 |
],
|