Update models.py

Commit 0e893b5 · Parent: eef299f

models.py CHANGED
```diff
@@ -131,10 +131,10 @@ class OpenAIModelGPT3_5(BaseTCOModel):
 
         return cost_per_input_token, cost_per_output_token, labor
 
-class OpenSourceLlama2Model(BaseTCOModel):
+class DIYLlama2Model(BaseTCOModel):
 
     def __init__(self):
-        self.set_name("(
+        self.set_name("(Deploy yourself) Llama 2 70B")
        self.set_latency("27s")
        super().__init__()
 
```
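The rename only changes the label the calculator displays; both the old and the new class lean on the same `BaseTCOModel` contract that the `ModelPage` hunk further down calls into. Below is a minimal sketch of that contract: only the method names (`set_name`, `set_latency`, `get_name`, `get_latency`, `get_components_for_cost_computing`, `compute_cost_per_token`) appear in this diff, so the bodies are illustrative assumptions.

```python
# Hypothetical reconstruction of the BaseTCOModel interface used in this diff.
# Only the method names are taken from the source; the bodies are assumptions.
class BaseTCOModel:
    def __init__(self):
        # Gradio inputs that feed compute_cost_per_token, in declaration order.
        self.components = []

    def set_name(self, name):
        self.name = name

    def get_name(self):
        return self.name

    def set_latency(self, latency):
        self.latency = latency

    def get_latency(self):
        return self.latency

    def get_components_for_cost_computing(self):
        return self.components

    def compute_cost_per_token(self, *args):
        # Subclasses return (cost_per_input_token, cost_per_output_token, labor).
        raise NotImplementedError
```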
```diff
@@ -144,7 +144,7 @@ class OpenSourceLlama2Model(BaseTCOModel):
         input_tokens_cost_per_token = 0.00052
         r = maxed_out / 100
         return input_tokens_cost_per_token * 0.65 / r, output_tokens_cost_per_token * 0.65/ r
-
+
         self.source = gr.Markdown("""<span style="font-size: 16px; font-weight: 600; color: #212529;">Source</span>""")
         self.info = gr.Markdown("The cost per input and output tokens values below are from [these benchmark results](https://www.cursor.so/blog/llama-inference#user-content-fn-llama-paper) that were obtained using the following initial configurations.",
             interactive=False,
```
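These returned costs scale the benchmark figure inversely with hardware utilization: with `r = maxed_out / 100`, the effective cost is `benchmark_cost * 0.65 / r`. At `maxed_out = 65` the factors cancel and the benchmark number comes back unchanged, which suggests (an inference, not stated in the diff) that the cited benchmark corresponds to roughly 65% utilization. A quick sketch of the arithmetic:

```python
# Worked example of the utilization scaling above. The 0.00052 benchmark
# figure is from the diff; treating 0.65 as the benchmark's implied
# utilization is an assumption.
def scaled_cost(benchmark_cost_per_token: float, maxed_out_percent: float) -> float:
    r = maxed_out_percent / 100
    return benchmark_cost_per_token * 0.65 / r

print(scaled_cost(0.00052, 65))   # ~0.00052  -> scaling cancels at 65%
print(scaled_cost(0.00052, 100))  # ~0.000338 -> cheaper when fully utilized
print(scaled_cost(0.00052, 40))   # ~0.000845 -> pricier at low utilization
```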
```diff
@@ -176,7 +176,7 @@ class OpenSourceLlama2Model(BaseTCOModel):
         )
         self.maxed_out.change(on_maxed_out_change, inputs=[self.maxed_out, self.input_tokens_cost_per_token, self.output_tokens_cost_per_token], outputs=[self.input_tokens_cost_per_token, self.output_tokens_cost_per_token])
 
-        self.labor = gr.Number(
+        self.labor = gr.Number(5000, visible=False,
             label="($) Labor cost per month",
             info="This is an estimate of the labor cost of the AI engineer in charge of deploying the model",
             interactive=True
```
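Giving `self.labor` a default of 5000 and `visible=False` keeps the field out of the UI while its value still flows into the cost computation. A minimal standalone sketch of that Gradio pattern (the component names below are illustrative, not from the repo):

```python
# Minimal sketch: a hidden gr.Number still carries its default value into
# event handlers. Names below are illustrative, not from models.py.
import gradio as gr

with gr.Blocks() as demo:
    labor = gr.Number(5000, visible=False, label="($) Labor cost per month")
    yearly = gr.Number(label="($) Labor cost per year")
    compute = gr.Button("Compute")
    # The hidden component is passed to the callback like any visible input.
    compute.click(lambda monthly: monthly * 12, inputs=labor, outputs=yearly)

if __name__ == "__main__":
    demo.launch()
```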
```diff
@@ -266,16 +266,15 @@ class ModelPage:
 
     def compute_cost_per_token(self, *args):
         begin=0
-        current_model = args[-3]
+        current_model = args[-3]
         current_input_tokens = args[-2]
         current_output_tokens = args[-1]
         for model in self.models:
             model_n_args = len(model.get_components_for_cost_computing())
             if current_model == model.get_name():
-
                 model_args = args[begin:begin+model_n_args]
                 cost_per_input_token, cost_per_output_token, labor_cost = model.compute_cost_per_token(*model_args)
-                model_tco = cost_per_input_token * current_input_tokens + cost_per_output_token * current_output_tokens
+                model_tco = cost_per_input_token * current_input_tokens.value + cost_per_output_token * current_output_tokens.value
                 latency = model.get_latency()
 
                 return model_tco, latency, labor_cost
```
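`ModelPage` flattens every model's input components into a single `*args` tuple: the last three entries are the selected model name and the input/output token counts, and `begin` indexes each model's slice of the remaining arguments. Note that the new `.value` accesses assume the token counts arrive as Gradio components; if plain numbers were passed instead, that attribute lookup would fail. Below is a sketch of the dispatch using plain numbers, where the `begin` increment for non-matching models is an assumption (it falls outside the hunk shown above).

```python
# Sketch of the ModelPage dispatch above, using plain numeric token counts.
# The begin increment is assumed; it is not visible in the hunk.
def compute_cost_per_token(self, *args):
    begin = 0
    current_model = args[-3]         # selected model name
    current_input_tokens = args[-2]  # number of input tokens
    current_output_tokens = args[-1] # number of output tokens
    for model in self.models:
        model_n_args = len(model.get_components_for_cost_computing())
        if current_model == model.get_name():
            # Take only this model's slice of the flattened argument list.
            model_args = args[begin:begin + model_n_args]
            cost_in, cost_out, labor_cost = model.compute_cost_per_token(*model_args)
            model_tco = cost_in * current_input_tokens + cost_out * current_output_tokens
            return model_tco, model.get_latency(), labor_cost
        begin += model_n_args  # assumed: skip past this model's arguments
```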