edbeeching committed on
Commit
860c08d
Β·
2 Parent(s): 899e899 d99c813

Merge branch 'main' of https://huggingface.co/spaces/HuggingFaceH4/syngen

Browse files
Files changed (1) hide show
  1. app.py +299 -67
app.py CHANGED
@@ -8,7 +8,7 @@ from enum import Enum
8
  from datasets import get_dataset_infos
9
  from transformers import AutoConfig
10
  from huggingface_hub import whoami
11
- from typing import Optional, List, Tuple, Union
12
 
13
  """
14
  Still TODO:
@@ -17,6 +17,16 @@ from typing import Optional, List, Tuple, Union
17
  - validate max model params
18
  """
19
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  def verify_pro_status(token: Optional[Union[gr.OAuthToken, str]]) -> bool:
22
  """Verifies if the user is a Hugging Face PRO user or part of an enterprise org."""
@@ -49,7 +59,8 @@ class GenerationStatus(Enum):
49
  FAILED = "FAILED"
50
 
51
 
52
- MAX_SAMPLES = 10000 # max number of samples in the input dataset
 
53
  MAX_TOKENS = 8192
54
  MAX_MODEL_PARAMS = 20_000_000_000 # 20 billion parameters (for now)
55
 
@@ -79,7 +90,7 @@ class GenerationRequest:
79
  private: bool = False
80
  num_retries: int = 0
81
 
82
- def validate_request(request: GenerationRequest) -> GenerationRequest:
83
  # checks that the request is valid
84
  # - input dataset exists and can be accessed with the provided token
85
  try:
@@ -101,8 +112,17 @@ def validate_request(request: GenerationRequest) -> GenerationRequest:
101
 
102
 
103
 
104
- if request.num_output_examples > MAX_SAMPLES:
105
- raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {MAX_SAMPLES}.")
 
 
 
 
 
 
 
 
 
106
 
107
  # check the prompt column exists in the dataset
108
  if request.prompt_column not in input_dataset_info.features:
@@ -117,8 +137,34 @@ def validate_request(request: GenerationRequest) -> GenerationRequest:
117
  try:
118
  output_dataset_info = get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
119
  raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
120
- except Exception as e:
121
  pass # dataset does not exist, which is expected
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  # check the models exists
124
  try:
@@ -145,7 +191,7 @@ def validate_request(request: GenerationRequest) -> GenerationRequest:
145
  if request.top_p < 0.0 or request.top_p > 1.0:
146
  raise Exception("Top P must be between 0.0 and 1.0")
147
 
148
- # check valid email address TODO: use py3-validate-email https://stackoverflow.com/questions/8022530/how-to-check-for-valid-email-address
149
  if "@" not in request.email or "." not in request.email.split("@")[-1]:
150
  raise Exception("Invalid email address")
151
 
@@ -197,14 +243,58 @@ def add_request_to_db(request: GenerationRequest):
197
 
198
 
199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
 
202
  def main():
203
  with gr.Blocks(title="Synthetic Data Generation") as demo:
204
- gr.HTML("<h3 style='text-align:center'>Hugging Face PRO users can use the Synthetic generation service. <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Subscribe to PRO</a></h3>", elem_id="sub_title")
 
 
 
 
 
 
205
 
206
- pro_message = gr.Markdown(visible=False)
207
  main_interface = gr.Column(visible=False)
 
 
 
 
208
  with main_interface:
209
  with gr.Group():
210
  with gr.Row():
@@ -214,7 +304,7 @@ def main():
214
  Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
215
  Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
216
  """)
217
- with gr.Group():
218
  with gr.Row():
219
  gr.Markdown("""
220
  **How it works:**
@@ -227,59 +317,169 @@ def main():
227
 
228
  **Requirements:**
229
  - Input dataset must be publicly accessible
230
- - Model must be accessible (public and not gated)
231
  - Maximum 10,000 samples per dataset
232
- - Maximum of 8192 generation tokens
233
  """)
234
 
235
- with gr.Group():
236
- gr.Markdown("## Dataset information")
237
- with gr.Column():
238
- with gr.Row():
239
- input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
240
- input_dataset_split = gr.Textbox(label="Input Dataset Split", value="train", placeholder="e.g., train, test, validation")
241
- input_dataset_config = gr.Textbox(label="Input Dataset Config", value="default", placeholder="e.g., default, custom")
242
- prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
 
244
- with gr.Column():
245
  with gr.Row():
246
- output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'")
247
- num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES, step=1)
248
- with gr.Group():
249
- gr.Markdown("## Model information")
250
- with gr.Column():
 
 
 
 
251
  with gr.Row():
252
- model_name_or_path = gr.Textbox(label="Model Name or Path", placeholder="e.g., Qwen/Qwen3-4B-Instruct-2507")
253
- model_revision = gr.Textbox(label="Model Revision", value="main", placeholder="e.g., main, v1.0")
254
- # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
255
- with gr.Group():
256
- gr.Markdown("## Generation Parameters")
257
- with gr.Row():
258
- with gr.Column():
259
- with gr.Row():
260
- max_tokens = gr.Slider(label="Max Tokens", value=512, minimum=256, maximum=MAX_TOKENS, step=256)
261
- temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
262
- with gr.Row():
263
- top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
264
- top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
265
- with gr.Row():
266
- system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
267
 
268
- with gr.Group():
269
- gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
270
- with gr.Row():
271
- with gr.Column():
272
  with gr.Row():
273
- email = gr.Textbox(label="Email", placeholder="[email protected]")
274
- # with gr.Row():
275
- # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
276
- # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
277
-
278
- submit_btn = gr.Button("Submit Generation Request", variant="primary")
279
- output_status = gr.Textbox(label="Status", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
 
281
- def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, model_rev, sys_prompt,
282
- max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples):
283
 
284
  MASTER_ORG = "synthetic-data-universe/"
285
  model_token = False # This is currently not supported
@@ -297,7 +497,7 @@ def main():
297
  output_dataset_name=MASTER_ORG + output_dataset_name,
298
  prompt_column=prompt_col,
299
  model_name_or_path=model_name,
300
- model_revision=model_rev,
301
  model_token=model_token,
302
  system_prompt=sys_prompt if sys_prompt else None,
303
  max_tokens=int(max_tok),
@@ -312,7 +512,7 @@ def main():
312
  )
313
 
314
  # check the input dataset exists and can be accessed with the provided token
315
- request = validate_request(request)
316
  add_request_to_db(request)
317
 
318
  return "Request submitted successfully!"
@@ -322,25 +522,57 @@ def main():
322
  submit_btn.click(
323
  submit_request,
324
  inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
325
- model_revision, system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples],
326
  outputs=output_status
327
  )
328
 
 
 
 
 
 
 
 
 
 
 
329
  def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
330
- if not profile: return gr.update(visible=False), gr.update(visible=False)
331
- if verify_pro_status(oauth_token): return gr.update(visible=True), gr.update(visible=False)
 
 
 
 
 
 
 
 
 
332
  else:
333
- message = (
334
- "## ✨ Exclusive Access for PRO Users\n\n"
335
- "Thank you for your interest! This app is available exclusively for our Hugging Face **PRO** members.\n\n"
336
- "To unlock this and many other cool stuff, please consider upgrading your account.\n\n"
337
- "### [**Become a PRO Today!**](http://huggingface.co/subscribe/pro?source=synthetic-data-universe)"
 
 
 
 
 
 
 
 
 
 
 
 
338
  )
339
- return gr.update(visible=False), gr.update(visible=True, value=message)
340
 
341
- login_button = gr.LoginButton() # this is required or AUTH will not work
342
 
343
- demo.load(control_access, inputs=None, outputs=[main_interface, pro_message])
 
 
 
344
  demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)
345
 
346
  if __name__ == "__main__":
 
8
  from datasets import get_dataset_infos
9
  from transformers import AutoConfig
10
  from huggingface_hub import whoami
11
+ from typing import Optional, Union
12
 
13
  """
14
  Still TODO:
 
17
  - validate max model params
18
  """
19
 
20
+ SUPPORTED_MODELS = [
21
+ "Qwen/Qwen3-4B-Instruct-2507",
22
+ "Qwen/Qwen3-30B-A3B-Instruct-2507",
23
+ "meta-llama/Llama-3.2-1B-Instruct",
24
+ "meta-llama/Llama-3.2-3B-Instruct",
25
+ "baidu/ERNIE-4.5-21B-A3B-Thinking",
26
+ "LLM360/K2-Think",
27
+ "openai/gpt-oss-20b",
28
+ ]
29
+
30
 
31
  def verify_pro_status(token: Optional[Union[gr.OAuthToken, str]]) -> bool:
32
  """Verifies if the user is a Hugging Face PRO user or part of an enterprise org."""
 
59
  FAILED = "FAILED"
60
 
61
 
62
+ MAX_SAMPLES_PRO = 10000 # max number of samples for PRO/Enterprise users
63
+ MAX_SAMPLES_FREE = 100 # max number of samples for free users
64
  MAX_TOKENS = 8192
65
  MAX_MODEL_PARAMS = 20_000_000_000 # 20 billion parameters (for now)
66
 
 
90
  private: bool = False
91
  num_retries: int = 0
92
 
93
+ def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.OAuthToken, str]] = None) -> GenerationRequest:
94
  # checks that the request is valid
95
  # - input dataset exists and can be accessed with the provided token
96
  try:
 
112
 
113
 
114
 
115
+ # Check user tier and apply appropriate limits
116
+ # Anonymous users (oauth_token is None) are treated as free tier
117
+ is_pro = verify_pro_status(oauth_token) if oauth_token else False
118
+ max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE
119
+
120
+ if request.num_output_examples > max_samples:
121
+ if oauth_token is None:
122
+ user_tier = "non-signed-in"
123
+ else:
124
+ user_tier = "PRO/Enterprise" if is_pro else "free"
125
+ raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {max_samples} for {user_tier} users.")
126
 
127
  # check the prompt column exists in the dataset
128
  if request.prompt_column not in input_dataset_info.features:
 
137
  try:
138
  output_dataset_info = get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
139
  raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
140
+ except Exception:
141
  pass # dataset does not exist, which is expected
142
+
143
+ # check the output dataset name doesn't already exist in the database
144
+ try:
145
+ url = os.getenv("SUPABASE_URL")
146
+ key = os.getenv("SUPABASE_KEY")
147
+
148
+ if url and key:
149
+ supabase = create_client(
150
+ url,
151
+ key,
152
+ options=ClientOptions(
153
+ postgrest_client_timeout=10,
154
+ storage_client_timeout=10,
155
+ schema="public",
156
+ )
157
+ )
158
+
159
+ existing_request = supabase.table("gen-requests").select("id").eq("output_dataset_name", request.output_dataset_name).execute()
160
+ if existing_request.data:
161
+ raise Exception(f"Output dataset {request.output_dataset_name} is already being generated or has been requested. Please choose a different name.")
162
+ except Exception as e:
163
+ # If it's our custom exception about dataset already existing, re-raise it
164
+ if "already being generated" in str(e):
165
+ raise e
166
+ # Otherwise, ignore database connection errors and continue
167
+ pass
168
 
169
  # check the models exists
170
  try:
 
191
  if request.top_p < 0.0 or request.top_p > 1.0:
192
  raise Exception("Top P must be between 0.0 and 1.0")
193
 
194
+ # check valid email address TODO: could use py3-validate-email https://stackoverflow.com/questions/8022530/how-to-check-for-valid-email-address
195
  if "@" not in request.email or "." not in request.email.split("@")[-1]:
196
  raise Exception("Invalid email address")
197
 
 
243
 
244
 
245
 
246
+ def get_generation_stats_safe():
247
+ """Safely fetch generation request statistics with proper error handling"""
248
+ try:
249
+ url = os.getenv("SUPABASE_URL")
250
+ key = os.getenv("SUPABASE_KEY")
251
+
252
+ if not url or not key:
253
+ raise Exception("Missing SUPABASE_URL or SUPABASE_KEY environment variables")
254
+
255
+ supabase = create_client(
256
+ url,
257
+ key,
258
+ options=ClientOptions(
259
+ postgrest_client_timeout=10,
260
+ storage_client_timeout=10,
261
+ schema="public",
262
+ )
263
+ )
264
+
265
+ # Fetch data excluding sensitive token fields
266
+ response = supabase.table("gen-requests").select(
267
+ "id, created_at, status, input_dataset_name, input_dataset_config, "
268
+ "input_dataset_split, output_dataset_name, prompt_column, "
269
+ "model_name_or_path, model_revision, max_tokens, temperature, "
270
+ "top_k, top_p, username, num_output_examples, private"
271
+ ).order("created_at", desc=True).limit(50).execute()
272
+
273
+ return {"status": "success", "data": response.data}
274
+
275
+ except Exception as e:
276
+ return {"status": "error", "message": str(e), "data": []}
277
+
278
+
279
+ # Old commented code removed - replaced with DatabaseManager and get_generation_stats_safe()
280
 
281
 
282
  def main():
283
  with gr.Blocks(title="Synthetic Data Generation") as demo:
284
+ gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign up for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
285
+
286
+ # Add sign-in button at the top
287
+ with gr.Row():
288
+ gr.Markdown("") # Empty space for alignment
289
+ login_button = gr.LoginButton(value="πŸ”‘ Sign in", size="sm")
290
+ gr.Markdown("") # Empty space for alignment
291
 
292
+ signin_message = gr.Markdown("## πŸ”‘ Sign In Required\n\nPlease sign in with your Hugging Face account to access the synthetic data generation service. Click the **Sign in** button above to continue.", visible=True)
293
  main_interface = gr.Column(visible=False)
294
+
295
+ # Store the current oauth token for use in submit_request
296
+ current_oauth_token = gr.State(None)
297
+
298
  with main_interface:
299
  with gr.Group():
300
  with gr.Row():
 
304
  Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
305
  Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
306
  """)
307
+ with gr.Accordion("More Information", open=False):
308
  with gr.Row():
309
  gr.Markdown("""
310
  **How it works:**
 
317
 
318
  **Requirements:**
319
  - Input dataset must be publicly accessible
320
+ - Model must be publicly accessible (and not gated)
321
  - Maximum 10,000 samples per dataset
322
+ - Maximum of 8192 generated tokens
323
  """)
324
 
325
+ with gr.Tabs():
326
+ with gr.TabItem("Generate Synthetic Data"):
327
+ with gr.Group():
328
+ gr.Markdown("## Model information")
329
+ with gr.Column():
330
+ with gr.Row():
331
+ model_name_or_path = gr.Dropdown(
332
+ choices=SUPPORTED_MODELS,
333
+ label="Select Model",
334
+ value="Qwen/Qwen3-4B-Instruct-2507",
335
+ info="Choose from popular instruction-tuned models under 40B parameters"
336
+ )
337
+ # model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
338
+ with gr.Group():
339
+ gr.Markdown("## Dataset information")
340
+ # Dynamic user limit info - default to anonymous user
341
+ user_limit_info = gr.Markdown(value="πŸ‘€ **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
342
+ with gr.Row():
343
+ with gr.Column():
344
+ input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
345
+ prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question", value=None, interactive=False, info="Click Load Info to populate")
346
+
347
+ with gr.Column():
348
+ output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
349
 
 
350
  with gr.Row():
351
+ with gr.Column():
352
+ input_dataset_config = gr.Dropdown(label="Dataset Config", choices=[], value=None, interactive=False, info="Click Load Info to populate")
353
+ prompt_column = gr.Dropdown(label="Prompt Column", choices=[], value=None, interactive=False, info="Click Load Info to populate")
354
+
355
+ with gr.Column():
356
+ input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
357
+ num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")
358
+
359
+ gr.Markdown("### Generation Parameters")
360
  with gr.Row():
361
+ with gr.Column():
362
+ with gr.Row():
363
+ max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256)
364
+ temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
365
+ with gr.Row():
366
+ top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
367
+ top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
368
+ with gr.Row():
369
+ system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
 
 
 
 
 
 
370
 
371
+ with gr.Group():
372
+ gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
 
 
373
  with gr.Row():
374
+ with gr.Column():
375
+ with gr.Row():
376
+ email = gr.Textbox(label="Email", placeholder="[email protected]")
377
+ # with gr.Row():
378
+ # input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
379
+ # output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
380
+
381
+ submit_btn = gr.Button("Submit Generation Request", variant="primary")
382
+ output_status = gr.Textbox(label="Status", interactive=False)
383
+
384
+ with gr.TabItem("Statistics Dashboard"):
385
+ gr.Markdown("## Generation Requests Statistics")
386
+ gr.Markdown("πŸ“Š View recent synthetic data generation requests and their status.")
387
+
388
+ with gr.Row():
389
+ refresh_stats_btn = gr.Button("πŸ”„ Refresh Statistics", size="sm", variant="secondary")
390
+ clear_stats_btn = gr.Button("πŸ—‘οΈ Clear Display", size="sm")
391
+
392
+ stats_status = gr.Markdown("Click 'Refresh Statistics' to load recent generation requests.", visible=True)
393
+
394
+ stats_dataframe = gr.Dataframe(
395
+ headers=["ID", "Created", "Status", "Input Dataset", "Output Dataset", "Model", "Samples", "User"],
396
+ datatype=["str", "str", "str", "str", "str", "str", "number", "str"],
397
+ interactive=False,
398
+ wrap=True,
399
+ value=[],
400
+ label="Recent Generation Requests (Last 50)",
401
+ visible=False
402
+ )
403
+
404
+ def load_statistics():
405
+ """Load and format statistics data"""
406
+ try:
407
+ # Use the new safe database function
408
+ result = get_generation_stats_safe()
409
+
410
+ if result["status"] == "error":
411
+ return (
412
+ f"❌ **Error loading statistics**: {result['message']}",
413
+ gr.update(visible=False),
414
+ gr.update(visible=True)
415
+ )
416
+
417
+ data = result["data"]
418
+ if not data:
419
+ return (
420
+ "πŸ“ **No data found**: The database appears to be empty or the table doesn't exist yet.",
421
+ gr.update(visible=False),
422
+ gr.update(visible=True)
423
+ )
424
+
425
+ # Format data for display
426
+ formatted_data = []
427
+ for item in data:
428
+ # Format timestamp
429
+ created_at = item.get('created_at', 'Unknown')
430
+ if created_at and created_at != 'Unknown':
431
+ try:
432
+ from datetime import datetime
433
+ dt = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
434
+ created_at = dt.strftime('%Y-%m-%d %H:%M')
435
+ except:
436
+ pass
437
+
438
+ formatted_data.append([
439
+ str(item.get('id', ''))[:8] + "..." if len(str(item.get('id', ''))) > 8 else str(item.get('id', '')),
440
+ created_at,
441
+ item.get('status', 'Unknown'),
442
+ (item.get('input_dataset_name', '')[:30] + "...") if len(item.get('input_dataset_name', '')) > 30 else item.get('input_dataset_name', ''),
443
+ (item.get('output_dataset_name', '')[:30] + "...") if len(item.get('output_dataset_name', '')) > 30 else item.get('output_dataset_name', ''),
444
+ (item.get('model_name_or_path', '')[:25] + "...") if len(item.get('model_name_or_path', '')) > 25 else item.get('model_name_or_path', ''),
445
+ item.get('num_output_examples', 0),
446
+ item.get('username', 'Anonymous')
447
+ ])
448
+
449
+ return (
450
+ f"βœ… **Statistics loaded successfully**: Found {len(formatted_data)} recent requests.",
451
+ gr.update(value=formatted_data, visible=True),
452
+ gr.update(visible=True)
453
+ )
454
+
455
+ except Exception as e:
456
+ return (
457
+ f"❌ **Unexpected error**: {str(e)}",
458
+ gr.update(visible=False),
459
+ gr.update(visible=True)
460
+ )
461
+
462
+ def clear_statistics():
463
+ """Clear the statistics display"""
464
+ return (
465
+ "Click 'Refresh Statistics' to load recent generation requests.",
466
+ gr.update(value=[], visible=False),
467
+ gr.update(visible=True)
468
+ )
469
+
470
+ # Connect buttons to functions
471
+ refresh_stats_btn.click(
472
+ load_statistics,
473
+ outputs=[stats_status, stats_dataframe, stats_status]
474
+ )
475
+
476
+ clear_stats_btn.click(
477
+ clear_statistics,
478
+ outputs=[stats_status, stats_dataframe, stats_status]
479
+ )
480
 
481
+ def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, sys_prompt,
482
+ max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples, oauth_token=None):
483
 
484
  MASTER_ORG = "synthetic-data-universe/"
485
  model_token = False # This is currently not supported
 
497
  output_dataset_name=MASTER_ORG + output_dataset_name,
498
  prompt_column=prompt_col,
499
  model_name_or_path=model_name,
500
+ model_revision="main",
501
  model_token=model_token,
502
  system_prompt=sys_prompt if sys_prompt else None,
503
  max_tokens=int(max_tok),
 
512
  )
513
 
514
  # check the input dataset exists and can be accessed with the provided token
515
+ request = validate_request(request, oauth_token)
516
  add_request_to_db(request)
517
 
518
  return "Request submitted successfully!"
 
522
  submit_btn.click(
523
  submit_request,
524
  inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
525
+ system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples, current_oauth_token],
526
  outputs=output_status
527
  )
528
 
529
+ def update_user_limits(oauth_token):
530
+ if oauth_token is None:
531
+ return "πŸ‘€ **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples)."
532
+
533
+ is_pro = verify_pro_status(oauth_token)
534
+ if is_pro:
535
+ return "✨ **PRO User**: You can generate up to 10,000 samples per request."
536
+ else:
537
+ return "πŸ‘€ **Free User**: You can generate up to 100 samples per request. [Upgrade to PRO](http://huggingface.co/subscribe/pro?source=synthetic-data-universe) for 10,000 samples."
538
+
539
  def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
540
+ # Require users to be signed in
541
+ if oauth_token is None:
542
+ # User is not signed in - show sign-in prompt, hide main interface
543
+ return (
544
+ gr.update(visible=False), # main_interface
545
+ gr.update(visible=True), # signin_message
546
+ oauth_token, # current_oauth_token
547
+ "", # user_limit_info (empty when not signed in)
548
+ gr.update(), # num_output_samples (no change)
549
+ gr.update(value="πŸ”‘ Sign in") # login_button
550
+ )
551
  else:
552
+ # User is signed in - show main interface, hide sign-in prompt
553
+ limit_msg = update_user_limits(oauth_token)
554
+ is_pro = verify_pro_status(oauth_token)
555
+ max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE
556
+
557
+ if is_pro:
558
+ button_text = f"✨ Signed in as PRO ({profile.name if profile else 'User'})"
559
+ else:
560
+ button_text = f"πŸ‘€ Signed in as {profile.name if profile else 'User'}"
561
+
562
+ return (
563
+ gr.update(visible=True), # main_interface
564
+ gr.update(visible=False), # signin_message
565
+ oauth_token, # current_oauth_token
566
+ limit_msg, # user_limit_info
567
+ gr.update(maximum=max_samples), # num_output_samples
568
+ gr.update(value=button_text) # login_button
569
  )
 
570
 
 
571
 
572
+ # Handle login state changes - LoginButton automatically handles auth state changes
573
+ # The demo.load will handle both initial load and auth changes
574
+
575
+ demo.load(control_access, inputs=None, outputs=[main_interface, signin_message, current_oauth_token, user_limit_info, num_output_samples, login_button])
576
  demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)
577
 
578
  if __name__ == "__main__":