Spaces:
Running
Running
Merge branch 'main' of https://huggingface.co/spaces/HuggingFaceH4/syngen
Browse files
app.py
CHANGED
|
@@ -8,7 +8,7 @@ from enum import Enum
|
|
| 8 |
from datasets import get_dataset_infos
|
| 9 |
from transformers import AutoConfig
|
| 10 |
from huggingface_hub import whoami
|
| 11 |
-
from typing import Optional,
|
| 12 |
|
| 13 |
"""
|
| 14 |
Still TODO:
|
|
@@ -17,6 +17,16 @@ from typing import Optional, List, Tuple, Union
|
|
| 17 |
- validate max model params
|
| 18 |
"""
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
def verify_pro_status(token: Optional[Union[gr.OAuthToken, str]]) -> bool:
|
| 22 |
"""Verifies if the user is a Hugging Face PRO user or part of an enterprise org."""
|
|
@@ -49,7 +59,8 @@ class GenerationStatus(Enum):
|
|
| 49 |
FAILED = "FAILED"
|
| 50 |
|
| 51 |
|
| 52 |
-
|
|
|
|
| 53 |
MAX_TOKENS = 8192
|
| 54 |
MAX_MODEL_PARAMS = 20_000_000_000 # 20 billion parameters (for now)
|
| 55 |
|
|
@@ -79,7 +90,7 @@ class GenerationRequest:
|
|
| 79 |
private: bool = False
|
| 80 |
num_retries: int = 0
|
| 81 |
|
| 82 |
-
def validate_request(request: GenerationRequest) -> GenerationRequest:
|
| 83 |
# checks that the request is valid
|
| 84 |
# - input dataset exists and can be accessed with the provided token
|
| 85 |
try:
|
|
@@ -101,8 +112,17 @@ def validate_request(request: GenerationRequest) -> GenerationRequest:
|
|
| 101 |
|
| 102 |
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
# check the prompt column exists in the dataset
|
| 108 |
if request.prompt_column not in input_dataset_info.features:
|
|
@@ -117,8 +137,34 @@ def validate_request(request: GenerationRequest) -> GenerationRequest:
|
|
| 117 |
try:
|
| 118 |
output_dataset_info = get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
|
| 119 |
raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
|
| 120 |
-
except Exception
|
| 121 |
pass # dataset does not exist, which is expected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
# check the models exists
|
| 124 |
try:
|
|
@@ -145,7 +191,7 @@ def validate_request(request: GenerationRequest) -> GenerationRequest:
|
|
| 145 |
if request.top_p < 0.0 or request.top_p > 1.0:
|
| 146 |
raise Exception("Top P must be between 0.0 and 1.0")
|
| 147 |
|
| 148 |
-
# check valid email address TODO: use py3-validate-email https://stackoverflow.com/questions/8022530/how-to-check-for-valid-email-address
|
| 149 |
if "@" not in request.email or "." not in request.email.split("@")[-1]:
|
| 150 |
raise Exception("Invalid email address")
|
| 151 |
|
|
@@ -197,14 +243,58 @@ def add_request_to_db(request: GenerationRequest):
|
|
| 197 |
|
| 198 |
|
| 199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
|
| 202 |
def main():
|
| 203 |
with gr.Blocks(title="Synthetic Data Generation") as demo:
|
| 204 |
-
gr.HTML("<h3 style='text-align:center'>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
-
|
| 207 |
main_interface = gr.Column(visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
with main_interface:
|
| 209 |
with gr.Group():
|
| 210 |
with gr.Row():
|
|
@@ -214,7 +304,7 @@ def main():
|
|
| 214 |
Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
|
| 215 |
Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
|
| 216 |
""")
|
| 217 |
-
with gr.
|
| 218 |
with gr.Row():
|
| 219 |
gr.Markdown("""
|
| 220 |
**How it works:**
|
|
@@ -227,59 +317,169 @@ def main():
|
|
| 227 |
|
| 228 |
**Requirements:**
|
| 229 |
- Input dataset must be publicly accessible
|
| 230 |
-
- Model must be accessible (
|
| 231 |
- Maximum 10,000 samples per dataset
|
| 232 |
-
- Maximum of 8192
|
| 233 |
""")
|
| 234 |
|
| 235 |
-
with gr.
|
| 236 |
-
gr.
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
|
| 244 |
-
with gr.Column():
|
| 245 |
with gr.Row():
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
with gr.Row():
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
|
| 262 |
-
with gr.Row():
|
| 263 |
-
top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
|
| 264 |
-
top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
|
| 265 |
-
with gr.Row():
|
| 266 |
-
system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
|
| 267 |
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
with gr.Row():
|
| 271 |
-
with gr.Column():
|
| 272 |
with gr.Row():
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
-
def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name,
|
| 282 |
-
max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples):
|
| 283 |
|
| 284 |
MASTER_ORG = "synthetic-data-universe/"
|
| 285 |
model_token = False # This is currently not supported
|
|
@@ -297,7 +497,7 @@ def main():
|
|
| 297 |
output_dataset_name=MASTER_ORG + output_dataset_name,
|
| 298 |
prompt_column=prompt_col,
|
| 299 |
model_name_or_path=model_name,
|
| 300 |
-
model_revision=
|
| 301 |
model_token=model_token,
|
| 302 |
system_prompt=sys_prompt if sys_prompt else None,
|
| 303 |
max_tokens=int(max_tok),
|
|
@@ -312,7 +512,7 @@ def main():
|
|
| 312 |
)
|
| 313 |
|
| 314 |
# check the input dataset exists and can be accessed with the provided token
|
| 315 |
-
request = validate_request(request)
|
| 316 |
add_request_to_db(request)
|
| 317 |
|
| 318 |
return "Request submitted successfully!"
|
|
@@ -322,25 +522,57 @@ def main():
|
|
| 322 |
submit_btn.click(
|
| 323 |
submit_request,
|
| 324 |
inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
|
| 325 |
-
|
| 326 |
outputs=output_status
|
| 327 |
)
|
| 328 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
|
| 330 |
-
|
| 331 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
else:
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
)
|
| 339 |
-
return gr.update(visible=False), gr.update(visible=True, value=message)
|
| 340 |
|
| 341 |
-
login_button = gr.LoginButton() # this is required or AUTH will not work
|
| 342 |
|
| 343 |
-
|
|
|
|
|
|
|
|
|
|
| 344 |
demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)
|
| 345 |
|
| 346 |
if __name__ == "__main__":
|
|
|
|
| 8 |
from datasets import get_dataset_infos
|
| 9 |
from transformers import AutoConfig
|
| 10 |
from huggingface_hub import whoami
|
| 11 |
+
from typing import Optional, Union
|
| 12 |
|
| 13 |
"""
|
| 14 |
Still TODO:
|
|
|
|
| 17 |
- validate max model params
|
| 18 |
"""
|
| 19 |
|
| 20 |
+
SUPPORTED_MODELS = [
|
| 21 |
+
"Qwen/Qwen3-4B-Instruct-2507",
|
| 22 |
+
"Qwen/Qwen3-30B-A3B-Instruct-2507",
|
| 23 |
+
"meta-llama/Llama-3.2-1B-Instruct",
|
| 24 |
+
"meta-llama/Llama-3.2-3B-Instruct",
|
| 25 |
+
"baidu/ERNIE-4.5-21B-A3B-Thinking",
|
| 26 |
+
"LLM360/K2-Think",
|
| 27 |
+
"openai/gpt-oss-20b",
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
|
| 31 |
def verify_pro_status(token: Optional[Union[gr.OAuthToken, str]]) -> bool:
|
| 32 |
"""Verifies if the user is a Hugging Face PRO user or part of an enterprise org."""
|
|
|
|
| 59 |
FAILED = "FAILED"
|
| 60 |
|
| 61 |
|
| 62 |
+
MAX_SAMPLES_PRO = 10000 # max number of samples for PRO/Enterprise users
|
| 63 |
+
MAX_SAMPLES_FREE = 100 # max number of samples for free users
|
| 64 |
MAX_TOKENS = 8192
|
| 65 |
MAX_MODEL_PARAMS = 20_000_000_000 # 20 billion parameters (for now)
|
| 66 |
|
|
|
|
| 90 |
private: bool = False
|
| 91 |
num_retries: int = 0
|
| 92 |
|
| 93 |
+
def validate_request(request: GenerationRequest, oauth_token: Optional[Union[gr.OAuthToken, str]] = None) -> GenerationRequest:
|
| 94 |
# checks that the request is valid
|
| 95 |
# - input dataset exists and can be accessed with the provided token
|
| 96 |
try:
|
|
|
|
| 112 |
|
| 113 |
|
| 114 |
|
| 115 |
+
# Check user tier and apply appropriate limits
|
| 116 |
+
# Anonymous users (oauth_token is None) are treated as free tier
|
| 117 |
+
is_pro = verify_pro_status(oauth_token) if oauth_token else False
|
| 118 |
+
max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE
|
| 119 |
+
|
| 120 |
+
if request.num_output_examples > max_samples:
|
| 121 |
+
if oauth_token is None:
|
| 122 |
+
user_tier = "non-signed-in"
|
| 123 |
+
else:
|
| 124 |
+
user_tier = "PRO/Enterprise" if is_pro else "free"
|
| 125 |
+
raise Exception(f"Requested number of output examples {request.num_output_examples} exceeds the max limit of {max_samples} for {user_tier} users.")
|
| 126 |
|
| 127 |
# check the prompt column exists in the dataset
|
| 128 |
if request.prompt_column not in input_dataset_info.features:
|
|
|
|
| 137 |
try:
|
| 138 |
output_dataset_info = get_dataset_infos(request.output_dataset_name, token=request.output_dataset_token)
|
| 139 |
raise Exception(f"Output dataset {request.output_dataset_name} already exists. Please choose a different name.")
|
| 140 |
+
except Exception:
|
| 141 |
pass # dataset does not exist, which is expected
|
| 142 |
+
|
| 143 |
+
# check the output dataset name doesn't already exist in the database
|
| 144 |
+
try:
|
| 145 |
+
url = os.getenv("SUPABASE_URL")
|
| 146 |
+
key = os.getenv("SUPABASE_KEY")
|
| 147 |
+
|
| 148 |
+
if url and key:
|
| 149 |
+
supabase = create_client(
|
| 150 |
+
url,
|
| 151 |
+
key,
|
| 152 |
+
options=ClientOptions(
|
| 153 |
+
postgrest_client_timeout=10,
|
| 154 |
+
storage_client_timeout=10,
|
| 155 |
+
schema="public",
|
| 156 |
+
)
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
existing_request = supabase.table("gen-requests").select("id").eq("output_dataset_name", request.output_dataset_name).execute()
|
| 160 |
+
if existing_request.data:
|
| 161 |
+
raise Exception(f"Output dataset {request.output_dataset_name} is already being generated or has been requested. Please choose a different name.")
|
| 162 |
+
except Exception as e:
|
| 163 |
+
# If it's our custom exception about dataset already existing, re-raise it
|
| 164 |
+
if "already being generated" in str(e):
|
| 165 |
+
raise e
|
| 166 |
+
# Otherwise, ignore database connection errors and continue
|
| 167 |
+
pass
|
| 168 |
|
| 169 |
# check the models exists
|
| 170 |
try:
|
|
|
|
| 191 |
if request.top_p < 0.0 or request.top_p > 1.0:
|
| 192 |
raise Exception("Top P must be between 0.0 and 1.0")
|
| 193 |
|
| 194 |
+
# check valid email address TODO: could use py3-validate-email https://stackoverflow.com/questions/8022530/how-to-check-for-valid-email-address
|
| 195 |
if "@" not in request.email or "." not in request.email.split("@")[-1]:
|
| 196 |
raise Exception("Invalid email address")
|
| 197 |
|
|
|
|
| 243 |
|
| 244 |
|
| 245 |
|
| 246 |
+
def get_generation_stats_safe(limit: int = 50) -> dict:
    """Fetch the most recent generation requests without ever raising.

    Reads the Supabase connection settings from the ``SUPABASE_URL`` and
    ``SUPABASE_KEY`` environment variables, queries the ``gen-requests``
    table for the newest rows, and reports the outcome as a plain dict.

    Args:
        limit: Maximum number of rows to fetch, newest first. Defaults to
            50, the value that used to be hard-coded in the query.

    Returns:
        ``{"status": "success", "data": rows}`` when the query succeeds,
        otherwise ``{"status": "error", "message": text, "data": []}``.
    """
    url = os.getenv("SUPABASE_URL")
    key = os.getenv("SUPABASE_KEY")

    # Missing configuration is an expected condition: report it directly
    # instead of raising an exception only to catch it a few lines later.
    if not url or not key:
        return {
            "status": "error",
            "message": "Missing SUPABASE_URL or SUPABASE_KEY environment variables",
            "data": [],
        }

    # Only these columns are selected; the token fields are left out on purpose.
    columns = (
        "id, created_at, status, input_dataset_name, input_dataset_config, "
        "input_dataset_split, output_dataset_name, prompt_column, "
        "model_name_or_path, model_revision, max_tokens, temperature, "
        "top_k, top_p, username, num_output_examples, private"
    )

    try:
        supabase = create_client(
            url,
            key,
            options=ClientOptions(
                postgrest_client_timeout=10,
                storage_client_timeout=10,
                schema="public",
            ),
        )
        response = (
            supabase.table("gen-requests")
            .select(columns)
            .order("created_at", desc=True)
            .limit(limit)
            .execute()
        )
        return {"status": "success", "data": response.data}
    except Exception as e:
        # Deliberate catch-all: this function's contract is to hand any
        # failure back to the caller as an error payload, never to raise.
        return {"status": "error", "message": str(e), "data": []}
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
# Old commented code removed - replaced with DatabaseManager and get_generation_stats_safe()
|
| 280 |
|
| 281 |
|
| 282 |
def main():
|
| 283 |
with gr.Blocks(title="Synthetic Data Generation") as demo:
|
| 284 |
+
gr.HTML("<h3 style='text-align:center'>Generate synthetic data with AI models. Free to use! Sign up for PRO benefits (10k samples vs 100). <a href='http://huggingface.co/subscribe/pro?source=synthetic-data-universe' target='_blank'>Upgrade to PRO</a></h3>", elem_id="sub_title")
|
| 285 |
+
|
| 286 |
+
# Add sign-in button at the top
|
| 287 |
+
with gr.Row():
|
| 288 |
+
gr.Markdown("") # Empty space for alignment
|
| 289 |
+
login_button = gr.LoginButton(value="π Sign in", size="sm")
|
| 290 |
+
gr.Markdown("") # Empty space for alignment
|
| 291 |
|
| 292 |
+
signin_message = gr.Markdown("## π Sign In Required\n\nPlease sign in with your Hugging Face account to access the synthetic data generation service. Click the **Sign in** button above to continue.", visible=True)
|
| 293 |
main_interface = gr.Column(visible=False)
|
| 294 |
+
|
| 295 |
+
# Store the current oauth token for use in submit_request
|
| 296 |
+
current_oauth_token = gr.State(None)
|
| 297 |
+
|
| 298 |
with main_interface:
|
| 299 |
with gr.Group():
|
| 300 |
with gr.Row():
|
|
|
|
| 304 |
Welcome to the Synthetic Data Generation service! This tool allows you to generate synthetic data using large language models. Generation is FREE for Hugging Face PRO users and uses idle GPUs on the HF science cluster.\n
|
| 305 |
Outputs from this service will be PUBLIC and available on the Hugging Face Hub under the organization [synthetic-data-universe](https://huggingface.co/synthetic-data-universe).\n
|
| 306 |
""")
|
| 307 |
+
with gr.Accordion("More Information", open=False):
|
| 308 |
with gr.Row():
|
| 309 |
gr.Markdown("""
|
| 310 |
**How it works:**
|
|
|
|
| 317 |
|
| 318 |
**Requirements:**
|
| 319 |
- Input dataset must be publicly accessible
|
| 320 |
+
- Model must be publicly accessible (and not gated)
|
| 321 |
- Maximum 10,000 samples per dataset
|
| 322 |
+
- Maximum of 8192 generated tokens
|
| 323 |
""")
|
| 324 |
|
| 325 |
+
with gr.Tabs():
|
| 326 |
+
with gr.TabItem("Generate Synthetic Data"):
|
| 327 |
+
with gr.Group():
|
| 328 |
+
gr.Markdown("## Model information")
|
| 329 |
+
with gr.Column():
|
| 330 |
+
with gr.Row():
|
| 331 |
+
model_name_or_path = gr.Dropdown(
|
| 332 |
+
choices=SUPPORTED_MODELS,
|
| 333 |
+
label="Select Model",
|
| 334 |
+
value="Qwen/Qwen3-4B-Instruct-2507",
|
| 335 |
+
info="Choose from popular instruction-tuned models under 40B parameters"
|
| 336 |
+
)
|
| 337 |
+
# model_token = gr.Textbox(label="Model Token (Optional)", type="password", placeholder="Your HF token with read/write access to the model...")
|
| 338 |
+
with gr.Group():
|
| 339 |
+
gr.Markdown("## Dataset information")
|
| 340 |
+
# Dynamic user limit info - default to anonymous user
|
| 341 |
+
user_limit_info = gr.Markdown(value="π€ **Anonymous User**: You can generate up to 100 samples per request. Use the sign-in button above for PRO benefits (10,000 samples).", visible=True)
|
| 342 |
+
with gr.Row():
|
| 343 |
+
with gr.Column():
|
| 344 |
+
input_dataset_name = gr.Textbox(label="Input Dataset Name", placeholder="e.g., simplescaling/s1K-1.1")
|
| 345 |
+
prompt_column = gr.Textbox(label="Prompt Column", placeholder="e.g., text, prompt, question", value=None, interactive=False, info="Click Load Info to populate")
|
| 346 |
+
|
| 347 |
+
with gr.Column():
|
| 348 |
+
output_dataset_name = gr.Textbox(label="Output Dataset Name", placeholder="e.g., my-generated-dataset, must be unique. Will be created under the org 'synthetic-data-universe'", value=None, interactive=False, info="Click Load Info to populate")
|
| 349 |
|
|
|
|
| 350 |
with gr.Row():
|
| 351 |
+
with gr.Column():
|
| 352 |
+
input_dataset_config = gr.Dropdown(label="Dataset Config", choices=[], value=None, interactive=False, info="Click Load Info to populate")
|
| 353 |
+
prompt_column = gr.Dropdown(label="Prompt Column", choices=[], value=None, interactive=False, info="Click Load Info to populate")
|
| 354 |
+
|
| 355 |
+
with gr.Column():
|
| 356 |
+
input_dataset_split = gr.Dropdown(label="Dataset Split", choices=[], value=None, interactive=False, info="Click Load Info to populate")
|
| 357 |
+
num_output_samples = gr.Slider(label="Number of samples, leave as '0' for all", value=0, minimum=0, maximum=MAX_SAMPLES_FREE, step=1, interactive=False, info="Click Load Info to populate")
|
| 358 |
+
|
| 359 |
+
gr.Markdown("### Generation Parameters")
|
| 360 |
with gr.Row():
|
| 361 |
+
with gr.Column():
|
| 362 |
+
with gr.Row():
|
| 363 |
+
max_tokens = gr.Slider(label="Max Tokens", value=1024, minimum=256, maximum=MAX_TOKENS, step=256)
|
| 364 |
+
temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.1)
|
| 365 |
+
with gr.Row():
|
| 366 |
+
top_k = gr.Slider(label="Top K", value=50, minimum=5, maximum=100, step=5)
|
| 367 |
+
top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
|
| 368 |
+
with gr.Row():
|
| 369 |
+
system_prompt = gr.Textbox(label="System Prompt (Optional)", lines=3, placeholder="Optional system prompt... e.g., You are a helpful assistant.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
|
| 371 |
+
with gr.Group():
|
| 372 |
+
gr.Markdown("## User Information, for notification when your job is completed (still TODO)")
|
|
|
|
|
|
|
| 373 |
with gr.Row():
|
| 374 |
+
with gr.Column():
|
| 375 |
+
with gr.Row():
|
| 376 |
+
email = gr.Textbox(label="Email", placeholder="[email protected]")
|
| 377 |
+
# with gr.Row():
|
| 378 |
+
# input_dataset_token = gr.Textbox(label="Input dataset token", type="password", placeholder="Your HF token with read access to the input dataset, leave blank if public dataset")
|
| 379 |
+
# output_dataset_token = gr.Textbox(label="Output dataset token", type="password", placeholder="Your HF token with write access to the output dataset")
|
| 380 |
+
|
| 381 |
+
submit_btn = gr.Button("Submit Generation Request", variant="primary")
|
| 382 |
+
output_status = gr.Textbox(label="Status", interactive=False)
|
| 383 |
+
|
| 384 |
+
with gr.TabItem("Statistics Dashboard"):
|
| 385 |
+
gr.Markdown("## Generation Requests Statistics")
|
| 386 |
+
gr.Markdown("π View recent synthetic data generation requests and their status.")
|
| 387 |
+
|
| 388 |
+
with gr.Row():
|
| 389 |
+
refresh_stats_btn = gr.Button("π Refresh Statistics", size="sm", variant="secondary")
|
| 390 |
+
clear_stats_btn = gr.Button("ποΈ Clear Display", size="sm")
|
| 391 |
+
|
| 392 |
+
stats_status = gr.Markdown("Click 'Refresh Statistics' to load recent generation requests.", visible=True)
|
| 393 |
+
|
| 394 |
+
stats_dataframe = gr.Dataframe(
|
| 395 |
+
headers=["ID", "Created", "Status", "Input Dataset", "Output Dataset", "Model", "Samples", "User"],
|
| 396 |
+
datatype=["str", "str", "str", "str", "str", "str", "number", "str"],
|
| 397 |
+
interactive=False,
|
| 398 |
+
wrap=True,
|
| 399 |
+
value=[],
|
| 400 |
+
label="Recent Generation Requests (Last 50)",
|
| 401 |
+
visible=False
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
+
def load_statistics():
    """Load recent generation requests and format them for the statistics table.

    Returns:
        A 3-tuple in the order the refresh button wires its outputs: the
        status message, an update for the dataframe, and an update that
        keeps the status component visible.
    """
    # Imported once per call rather than once per row.
    from datetime import datetime

    def _clip(value, width):
        """Return *value* as text, cut to *width* characters plus "..." if longer."""
        # A key that is present with a None value would make len() fail and
        # abort the whole refresh; treat it as an empty string instead.
        text = "" if value is None else str(value)
        return text[:width] + "..." if len(text) > width else text

    def _format_created_at(raw):
        """Render an ISO-8601 timestamp as 'YYYY-MM-DD HH:MM'; pass anything else through."""
        if not raw or raw == 'Unknown':
            return raw
        try:
            parsed = datetime.fromisoformat(raw.replace('Z', '+00:00'))
            return parsed.strftime('%Y-%m-%d %H:%M')
        except (AttributeError, TypeError, ValueError):
            # Unparseable or non-string timestamp: show the raw value.
            return raw

    # NOTE(review): the leading emoji in the status strings below were
    # mis-encoded in the source text and have been reconstructed - confirm
    # them against the intended UI copy.
    try:
        result = get_generation_stats_safe()

        if result["status"] == "error":
            return (
                f"❌ **Error loading statistics**: {result['message']}",
                gr.update(visible=False),
                gr.update(visible=True),
            )

        data = result["data"]
        if not data:
            return (
                "📊 **No data found**: The database appears to be empty or the table doesn't exist yet.",
                gr.update(visible=False),
                gr.update(visible=True),
            )

        # One row per request, in the column order of the dataframe headers.
        formatted_data = [
            [
                _clip(str(item.get('id', '')), 8),
                _format_created_at(item.get('created_at', 'Unknown')),
                item.get('status', 'Unknown'),
                _clip(item.get('input_dataset_name', ''), 30),
                _clip(item.get('output_dataset_name', ''), 30),
                _clip(item.get('model_name_or_path', ''), 25),
                item.get('num_output_examples', 0),
                item.get('username', 'Anonymous'),
            ]
            for item in data
        ]

        return (
            f"✅ **Statistics loaded successfully**: Found {len(formatted_data)} recent requests.",
            gr.update(value=formatted_data, visible=True),
            gr.update(visible=True),
        )

    except Exception as e:
        # UI boundary: surface any unexpected failure as a status message
        # rather than letting the click handler raise.
        return (
            f"❌ **Unexpected error**: {str(e)}",
            gr.update(visible=False),
            gr.update(visible=True),
        )
|
| 461 |
+
|
| 462 |
+
def clear_statistics():
    """Reset the statistics display to its empty state.

    Returns:
        A 3-tuple: the default prompt text, an update that empties and hides
        the dataframe, and an update that makes the status component visible.
    """
    prompt_text = "Click 'Refresh Statistics' to load recent generation requests."
    emptied_table = gr.update(value=[], visible=False)
    shown_status = gr.update(visible=True)
    return prompt_text, emptied_table, shown_status
|
| 469 |
+
|
| 470 |
+
# Connect buttons to functions
|
| 471 |
+
refresh_stats_btn.click(
|
| 472 |
+
load_statistics,
|
| 473 |
+
outputs=[stats_status, stats_dataframe, stats_status]
|
| 474 |
+
)
|
| 475 |
+
|
| 476 |
+
clear_stats_btn.click(
|
| 477 |
+
clear_statistics,
|
| 478 |
+
outputs=[stats_status, stats_dataframe, stats_status]
|
| 479 |
+
)
|
| 480 |
|
| 481 |
+
def submit_request(input_dataset_name, input_split, input_dataset_config, output_dataset_name, prompt_col, model_name, sys_prompt,
|
| 482 |
+
max_tok, temp, top_k_val, top_p_val, email_addr, num_output_samples, oauth_token=None):
|
| 483 |
|
| 484 |
MASTER_ORG = "synthetic-data-universe/"
|
| 485 |
model_token = False # This is currently not supported
|
|
|
|
| 497 |
output_dataset_name=MASTER_ORG + output_dataset_name,
|
| 498 |
prompt_column=prompt_col,
|
| 499 |
model_name_or_path=model_name,
|
| 500 |
+
model_revision="main",
|
| 501 |
model_token=model_token,
|
| 502 |
system_prompt=sys_prompt if sys_prompt else None,
|
| 503 |
max_tokens=int(max_tok),
|
|
|
|
| 512 |
)
|
| 513 |
|
| 514 |
# check the input dataset exists and can be accessed with the provided token
|
| 515 |
+
request = validate_request(request, oauth_token)
|
| 516 |
add_request_to_db(request)
|
| 517 |
|
| 518 |
return "Request submitted successfully!"
|
|
|
|
| 522 |
submit_btn.click(
|
| 523 |
submit_request,
|
| 524 |
inputs=[input_dataset_name, input_dataset_split, input_dataset_config, output_dataset_name, prompt_column, model_name_or_path,
|
| 525 |
+
system_prompt, max_tokens, temperature, top_k, top_p, email, num_output_samples, current_oauth_token],
|
| 526 |
outputs=output_status
|
| 527 |
)
|
| 528 |
|
| 529 |
+
def update_user_limits(oauth_token):
    """Build the Markdown message describing the caller's per-request sample limit.

    Args:
        oauth_token: The signed-in user's OAuth token, or ``None`` for an
            anonymous visitor.

    Returns:
        A Markdown string naming the user's tier and its sample limit.
    """
    # Take the numbers from the module constants so the text cannot drift
    # from the limits that are actually enforced ("100" and "10,000" today).
    free_limit = f"{MAX_SAMPLES_FREE:,}"
    pro_limit = f"{MAX_SAMPLES_PRO:,}"

    if oauth_token is None:
        return (
            f"👤 **Anonymous User**: You can generate up to {free_limit} samples per request. "
            f"Use the sign-in button above for PRO benefits ({pro_limit} samples)."
        )

    if verify_pro_status(oauth_token):
        return f"✨ **PRO User**: You can generate up to {pro_limit} samples per request."

    return (
        f"👤 **Free User**: You can generate up to {free_limit} samples per request. "
        f"[Upgrade to PRO](http://huggingface.co/subscribe/pro?source=synthetic-data-universe) "
        f"for {pro_limit} samples."
    )
|
| 538 |
+
|
| 539 |
def control_access(profile: Optional[gr.OAuthProfile] = None, oauth_token: Optional[gr.OAuthToken] = None):
    """Show or hide the main interface depending on whether the user is signed in.

    Args:
        profile: The signed-in user's OAuth profile, if any; only its
            ``name`` is read, for the button label.
        oauth_token: The signed-in user's OAuth token, or ``None`` when the
            visitor is not signed in.

    Returns:
        A 6-tuple, in order: update for ``main_interface``, update for
        ``signin_message``, the token to store in ``current_oauth_token``,
        the text for ``user_limit_info``, update for ``num_output_samples``,
        and update for ``login_button``.
    """
    # Not signed in: keep the sign-in prompt up and the main interface hidden.
    if oauth_token is None:
        return (
            gr.update(visible=False),  # main_interface
            gr.update(visible=True),   # signin_message
            oauth_token,               # current_oauth_token
            "",                        # user_limit_info (empty when not signed in)
            gr.update(),               # num_output_samples (no change)
            # NOTE(review): this emoji was mis-encoded in the source text and
            # is a best-guess reconstruction - confirm the intended glyph.
            gr.update(value="🔑 Sign in"),  # login_button
        )

    # Signed in: show the main interface and apply the tier's sample limit.
    limit_msg = update_user_limits(oauth_token)
    is_pro = verify_pro_status(oauth_token)
    max_samples = MAX_SAMPLES_PRO if is_pro else MAX_SAMPLES_FREE

    display_name = profile.name if profile else 'User'
    if is_pro:
        button_text = f"✨ Signed in as PRO ({display_name})"
    else:
        button_text = f"👤 Signed in as {display_name}"

    return (
        gr.update(visible=True),         # main_interface
        gr.update(visible=False),        # signin_message
        oauth_token,                     # current_oauth_token
        limit_msg,                       # user_limit_info
        gr.update(maximum=max_samples),  # num_output_samples
        gr.update(value=button_text),    # login_button
    )
|
|
|
|
| 570 |
|
|
|
|
| 571 |
|
| 572 |
+
# Handle login state changes - LoginButton automatically handles auth state changes
|
| 573 |
+
# The demo.load will handle both initial load and auth changes
|
| 574 |
+
|
| 575 |
+
demo.load(control_access, inputs=None, outputs=[main_interface, signin_message, current_oauth_token, user_limit_info, num_output_samples, login_button])
|
| 576 |
demo.queue(max_size=None, default_concurrency_limit=None).launch(show_error=True)
|
| 577 |
|
| 578 |
if __name__ == "__main__":
|