nakas Claude committed on
Commit
2ac229b
·
1 Parent(s): 70985c2

Improve scraper robustness for Great Bear (YCGBR)

Browse files

Changes:
- Increased page load timeout to 60s
- Increased table wait timeout to 45s
- Increased the post-load sleep from 5s to 8s to allow dynamic content to render
- Improved table detection to accept any table whose text contains 'Date/Time' or 'Date', skipping tables that have no data rows
- Added in-page console.log output (table count, headers, data row count) for debugging
- Changed the Great Bear station ID from GBRM back to YCGBR now that the scraper is more robust

The scraper now waits longer and is more flexible in detecting data tables,
which should resolve issues with slower-loading stations like Great Bear.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>

Files changed (1) hide show
  1. app.py +23 -12
app.py CHANGED
@@ -46,35 +46,46 @@ def scrape_weather_data(site_id, hours=720):
46
  )
47
 
48
  page = context.new_page()
49
- response = page.goto(url)
50
  print(f"Response status: {response.status}")
51
-
52
- page.wait_for_selector('table', timeout=30000)
53
- time.sleep(5)
54
-
 
55
  print("Extracting data...")
56
  content = page.evaluate('''() => {
57
  const getTextContent = () => {
58
  const rows = [];
59
  const tables = document.getElementsByTagName('table');
 
 
60
  for (const table of tables) {
61
- if (table.textContent.includes('Date/Time')) {
 
 
62
  const headerRow = Array.from(table.querySelectorAll('th'))
63
  .map(th => th.textContent.trim());
64
-
 
 
65
  const dataRows = Array.from(table.querySelectorAll('tbody tr'))
66
  .map(row => Array.from(row.querySelectorAll('td'))
67
  .map(td => td.textContent.trim()));
68
-
69
- return {headers: headerRow, rows: dataRows};
 
 
 
 
70
  }
71
  }
72
  return null;
73
  };
74
-
75
  return getTextContent();
76
  }''')
77
-
78
  print(f"Found {len(content['rows'] if content else [])} rows of data")
79
  browser.close()
80
 
@@ -421,7 +432,7 @@ with gr.Blocks(title="Weather Station Data Analyzer") as demo:
421
  ("Yellowstone Club - Andesite", "YCAND"),
422
  ("Yellowstone Club - American Spirit", "YCAMS"),
423
  ("Yellowstone Club - Base", "YCBAS"),
424
- ("Yellowstone Club - Great Bear", "GBRM"),
425
  ("Bozeman Airport", "KBZN"),
426
  ("Salt Lake City", "KSLC")
427
  ],
 
46
  )
47
 
48
  page = context.new_page()
49
+ response = page.goto(url, timeout=60000)
50
  print(f"Response status: {response.status}")
51
+
52
+ # Wait for table to load with longer timeout
53
+ page.wait_for_selector('table', timeout=45000)
54
+ time.sleep(8) # Give more time for dynamic content
55
+
56
  print("Extracting data...")
57
  content = page.evaluate('''() => {
58
  const getTextContent = () => {
59
  const rows = [];
60
  const tables = document.getElementsByTagName('table');
61
+ console.log(`Found ${tables.length} tables`);
62
+
63
  for (const table of tables) {
64
+ const text = table.textContent;
65
+ // Look for Date/Time or just Date as header
66
+ if (text.includes('Date/Time') || text.includes('Date')) {
67
  const headerRow = Array.from(table.querySelectorAll('th'))
68
  .map(th => th.textContent.trim());
69
+
70
+ console.log('Headers:', headerRow);
71
+
72
  const dataRows = Array.from(table.querySelectorAll('tbody tr'))
73
  .map(row => Array.from(row.querySelectorAll('td'))
74
  .map(td => td.textContent.trim()));
75
+
76
+ console.log(`Found ${dataRows.length} data rows`);
77
+
78
+ if (dataRows.length > 0) {
79
+ return {headers: headerRow, rows: dataRows};
80
+ }
81
  }
82
  }
83
  return null;
84
  };
85
+
86
  return getTextContent();
87
  }''')
88
+
89
  print(f"Found {len(content['rows'] if content else [])} rows of data")
90
  browser.close()
91
 
 
432
  ("Yellowstone Club - Andesite", "YCAND"),
433
  ("Yellowstone Club - American Spirit", "YCAMS"),
434
  ("Yellowstone Club - Base", "YCBAS"),
435
+ ("Yellowstone Club - Great Bear", "YCGBR"),
436
  ("Bozeman Airport", "KBZN"),
437
  ("Salt Lake City", "KSLC")
438
  ],