Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
·
54434cf
1
Parent(s):
9608f9f
add description
Browse files- src/components.py +173 -102
src/components.py
CHANGED
|
@@ -65,16 +65,12 @@ def form(df):
|
|
| 65 |
pre_steps = st.multiselect(
|
| 66 |
"Select pre-lemmatization processing steps (ordered)",
|
| 67 |
options=steps_options,
|
| 68 |
-
default=[
|
| 69 |
-
steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value
|
| 70 |
-
],
|
| 71 |
format_func=lambda x: x.replace("_", " ").title(),
|
| 72 |
help="Select the processing steps to apply before the text is lemmatized",
|
| 73 |
)
|
| 74 |
|
| 75 |
-
lammatization_options = list(
|
| 76 |
-
PreprocessingPipeline.lemmatization_component().keys()
|
| 77 |
-
)
|
| 78 |
lemmatization_step = st.selectbox(
|
| 79 |
"Select lemmatization",
|
| 80 |
options=lammatization_options,
|
|
@@ -85,10 +81,7 @@ def form(df):
|
|
| 85 |
post_steps = st.multiselect(
|
| 86 |
"Select post-lemmatization processing steps (ordered)",
|
| 87 |
options=steps_options,
|
| 88 |
-
default=[
|
| 89 |
-
steps_options[i]
|
| 90 |
-
for i in PreprocessingConfigs.DEFAULT_POST.value
|
| 91 |
-
],
|
| 92 |
format_func=lambda x: x.replace("_", " ").title(),
|
| 93 |
help="Select the processing steps to apply after the text is lemmatized",
|
| 94 |
)
|
|
@@ -100,31 +93,21 @@ def form(df):
|
|
| 100 |
start_time = time.time()
|
| 101 |
|
| 102 |
# warnings about inputs
|
| 103 |
-
language_specific_warnings(
|
| 104 |
-
pre_steps, post_steps, lemmatization_step, language
|
| 105 |
-
)
|
| 106 |
|
| 107 |
# preprocess
|
| 108 |
if not disable_preprocessing:
|
| 109 |
with st.spinner("Step 1/4: Preprocessing text"):
|
| 110 |
-
pipe = PreprocessingPipeline(
|
| 111 |
-
language, pre_steps, lemmatization_step, post_steps
|
| 112 |
-
)
|
| 113 |
df = pipe.vaex_process(df, text_column)
|
| 114 |
else:
|
| 115 |
-
with st.spinner(
|
| 116 |
-
|
| 117 |
-
):
|
| 118 |
-
df = df.rename(
|
| 119 |
-
columns={text_column: ColumnNames.PROCESSED_TEXT.value}
|
| 120 |
-
)
|
| 121 |
time.sleep(1.2)
|
| 122 |
|
| 123 |
# prepare input
|
| 124 |
with st.spinner("Step 2/4: Preparing inputs"):
|
| 125 |
-
input_dict = input_transform(
|
| 126 |
-
df[ColumnNames.PROCESSED_TEXT.value], df[label_column]
|
| 127 |
-
)
|
| 128 |
|
| 129 |
# wordify
|
| 130 |
with st.spinner("Step 3/4: Wordifying"):
|
|
@@ -146,6 +129,168 @@ def form(df):
|
|
| 146 |
return new_df, meta_data
|
| 147 |
|
| 148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
def faq():
|
| 150 |
st.subheader("Frequently Asked Questions")
|
| 151 |
with st.expander("What is Wordify?"):
|
|
@@ -249,75 +394,6 @@ def faq():
|
|
| 249 |
st.markdown(contacts(), unsafe_allow_html=True)
|
| 250 |
|
| 251 |
|
| 252 |
-
def presentation():
|
| 253 |
-
st.markdown(
|
| 254 |
-
"""
|
| 255 |
-
Wordify makes it easy to identify words that discriminate categories in textual data.
|
| 256 |
-
|
| 257 |
-
:point_left: Start by uploading a file. *Once you upload the file, __Wordify__ will
|
| 258 |
-
show an interactive UI*.
|
| 259 |
-
"""
|
| 260 |
-
)
|
| 261 |
-
|
| 262 |
-
st.subheader("Quickstart")
|
| 263 |
-
st.markdown(
|
| 264 |
-
"""
|
| 265 |
-
- There is no need to preprocess your text, we will take care of it. However, if you wish to
|
| 266 |
-
do so, turn off preprocessing in the `Advanced Settings` in the interactive UI.
|
| 267 |
-
|
| 268 |
-
- We expect a file with two columns: `label` with the labels and `text` with the texts (the names are case insensitive). If
|
| 269 |
-
you provide a file following this naming convention, Wordify will automatically select the
|
| 270 |
-
correct columns. However, if you wish to use a different nomenclature, you will be asked to
|
| 271 |
-
provide the column names in the interactive UI.
|
| 272 |
-
|
| 273 |
-
- Maintain a stable connection with the Wordify page until you download the data. If you refresh the page,
|
| 274 |
-
a new Wordify session is created and your progress is lost.
|
| 275 |
-
|
| 276 |
-
- Wordify's performance depends on the length of the individual texts in your file. The longer the texts, the higher
|
| 277 |
-
the chance that Wordify considers many n-grams. More n-grams means more data to analyse in each run.
|
| 278 |
-
We tailored Wordify performance for files of approximately 5'000 lines or 50k n-grams. In such cases we expect a runtime
|
| 279 |
-
between 90 seconds and 10 minutes. If your file is big, try to apply a stricter preprocessing of the text in the `Advanced Options` section.
|
| 280 |
-
If this is not enough, please do feel free to reach out to us directly so we can help.
|
| 281 |
-
"""
|
| 282 |
-
)
|
| 283 |
-
|
| 284 |
-
st.subheader("Input format")
|
| 285 |
-
st.markdown(
|
| 286 |
-
"""
|
| 287 |
-
Please note that your file must have a column with the texts and a column with the labels,
|
| 288 |
-
for example
|
| 289 |
-
"""
|
| 290 |
-
)
|
| 291 |
-
st.table(
|
| 292 |
-
{
|
| 293 |
-
"text": ["A review", "Another review", "Yet another one", "etc"],
|
| 294 |
-
"label": ["Good", "Bad", "Good", "etc"],
|
| 295 |
-
}
|
| 296 |
-
)
|
| 297 |
-
|
| 298 |
-
st.subheader("Output format")
|
| 299 |
-
st.markdown(
|
| 300 |
-
"""
|
| 301 |
-
As a result of the process, you will get a file containing 4 columns:
|
| 302 |
-
- `Word`: the n-gram (i.e., a word or a concatenation of words) considered
|
| 303 |
-
- `Score`: the wordify score, between 0 and 1, of how important `Word` is to discriminate `Label`
|
| 304 |
-
- `Label`: the label that `Word` is discriminating
|
| 305 |
-
- `Correlation`: how `Word` is correlated with `Label` (e.g., "negative" means that if `Word` is present in the text then the label is less likely to be `Label`)
|
| 306 |
-
|
| 307 |
-
for example
|
| 308 |
-
"""
|
| 309 |
-
)
|
| 310 |
-
|
| 311 |
-
st.table(
|
| 312 |
-
{
|
| 313 |
-
"Word": ["good", "awful", "bad service", "etc"],
|
| 314 |
-
"Score": ["0.52", "0.49", "0.35", "etc"],
|
| 315 |
-
"Label": ["Good", "Bad", "Good", "etc"],
|
| 316 |
-
"Correlation": ["positive", "positive", "negative", "etc"],
|
| 317 |
-
}
|
| 318 |
-
)
|
| 319 |
-
|
| 320 |
-
|
| 321 |
def footer():
|
| 322 |
st.sidebar.markdown(
|
| 323 |
"""
|
|
@@ -383,15 +459,11 @@ def analysis(outputs):
|
|
| 383 |
)
|
| 384 |
|
| 385 |
with st.expander("Vocabulary"):
|
| 386 |
-
st.markdown(
|
| 387 |
-
"The table below shows all candidate n-grams that Wordify considered"
|
| 388 |
-
)
|
| 389 |
st.write(meta_data["vocabulary"])
|
| 390 |
|
| 391 |
with st.expander("Labels"):
|
| 392 |
-
st.markdown(
|
| 393 |
-
"The table below summarizes the labels that your file contained"
|
| 394 |
-
)
|
| 395 |
st.write(meta_data["labels"])
|
| 396 |
|
| 397 |
return subset_df
|
|
@@ -421,6 +493,5 @@ def language_specific_warnings(pre_steps, post_steps, lemmatization_step, langua
|
|
| 421 |
"Chinese",
|
| 422 |
):
|
| 423 |
st.info(
|
| 424 |
-
msg
|
| 425 |
-
+ " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
|
| 426 |
)
|
|
|
|
| 65 |
pre_steps = st.multiselect(
|
| 66 |
"Select pre-lemmatization processing steps (ordered)",
|
| 67 |
options=steps_options,
|
| 68 |
+
default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_PRE.value],
|
|
|
|
|
|
|
| 69 |
format_func=lambda x: x.replace("_", " ").title(),
|
| 70 |
help="Select the processing steps to apply before the text is lemmatized",
|
| 71 |
)
|
| 72 |
|
| 73 |
+
lammatization_options = list(PreprocessingPipeline.lemmatization_component().keys())
|
|
|
|
|
|
|
| 74 |
lemmatization_step = st.selectbox(
|
| 75 |
"Select lemmatization",
|
| 76 |
options=lammatization_options,
|
|
|
|
| 81 |
post_steps = st.multiselect(
|
| 82 |
"Select post-lemmatization processing steps (ordered)",
|
| 83 |
options=steps_options,
|
| 84 |
+
default=[steps_options[i] for i in PreprocessingConfigs.DEFAULT_POST.value],
|
|
|
|
|
|
|
|
|
|
| 85 |
format_func=lambda x: x.replace("_", " ").title(),
|
| 86 |
help="Select the processing steps to apply after the text is lemmatized",
|
| 87 |
)
|
|
|
|
| 93 |
start_time = time.time()
|
| 94 |
|
| 95 |
# warnings about inputs
|
| 96 |
+
language_specific_warnings(pre_steps, post_steps, lemmatization_step, language)
|
|
|
|
|
|
|
| 97 |
|
| 98 |
# preprocess
|
| 99 |
if not disable_preprocessing:
|
| 100 |
with st.spinner("Step 1/4: Preprocessing text"):
|
| 101 |
+
pipe = PreprocessingPipeline(language, pre_steps, lemmatization_step, post_steps)
|
|
|
|
|
|
|
| 102 |
df = pipe.vaex_process(df, text_column)
|
| 103 |
else:
|
| 104 |
+
with st.spinner("Step 1/4: Preprocessing has been disabled - doing nothing"):
|
| 105 |
+
df = df.rename(columns={text_column: ColumnNames.PROCESSED_TEXT.value})
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
time.sleep(1.2)
|
| 107 |
|
| 108 |
# prepare input
|
| 109 |
with st.spinner("Step 2/4: Preparing inputs"):
|
| 110 |
+
input_dict = input_transform(df[ColumnNames.PROCESSED_TEXT.value], df[label_column])
|
|
|
|
|
|
|
| 111 |
|
| 112 |
# wordify
|
| 113 |
with st.spinner("Step 3/4: Wordifying"):
|
|
|
|
| 129 |
return new_df, meta_data
|
| 130 |
|
| 131 |
|
| 132 |
+
def presentation():
|
| 133 |
+
st.markdown(
|
| 134 |
+
"""
|
| 135 |
+
Wordify makes it easy to identify words that discriminate categories in textual data.
|
| 136 |
+
It was proposed by Dirk Hovy, Shiri Melumad, and Jeffrey J Inman in
|
| 137 |
+
[Wordify: A Tool for Discovering and Differentiating Consumer Vocabularies](https://academic.oup.com/jcr/article/48/3/394/6199426).
|
| 138 |
+
|
| 139 |
+
:point_left: Start by uploading a file. *Once you upload the file, __Wordify__ will
|
| 140 |
+
show an interactive UI*.
|
| 141 |
+
"""
|
| 142 |
+
)
|
| 143 |
+
|
| 144 |
+
st.subheader("Quickstart")
|
| 145 |
+
st.markdown(
|
| 146 |
+
"""
|
| 147 |
+
- There is no need to preprocess your text, we will take care of it. However, if you wish to
|
| 148 |
+
do so, turn off preprocessing in the `Advanced Settings` in the interactive UI.
|
| 149 |
+
|
| 150 |
+
- We expect a file with two columns: `label` with the labels and `text` with the texts (the names are case insensitive). If
|
| 151 |
+
you provide a file following this naming convention, Wordify will automatically select the
|
| 152 |
+
correct columns. However, if you wish to use a different nomenclature, you will be asked to
|
| 153 |
+
provide the column names in the interactive UI.
|
| 154 |
+
|
| 155 |
+
- Maintain a stable connection with the Wordify page until you download the data. If you refresh the page,
|
| 156 |
+
a new Wordify session is created and your progress is lost.
|
| 157 |
+
|
| 158 |
+
- Wordify's performance depends on the length of the individual texts in your file. The longer the texts, the higher
|
| 159 |
+
the chance that Wordify considers many n-grams. More n-grams means more data to analyse in each run.
|
| 160 |
+
We tailored Wordify performance for files of approximately 5'000 lines or 50k n-grams. In such cases we expect a runtime
|
| 161 |
+
between 90 seconds and 10 minutes. If your file is big, try to apply a stricter preprocessing of the text in the `Advanced Options` section.
|
| 162 |
+
If this is not enough, please do feel free to reach out to us directly so we can help.
|
| 163 |
+
"""
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
how_to_use()
|
| 167 |
+
how_it_works()
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def how_to_use():
|
| 171 |
+
with st.expander("How to use Wordify"):
|
| 172 |
+
|
| 173 |
+
st.subheader("Input format")
|
| 174 |
+
st.markdown(
|
| 175 |
+
"""
|
| 176 |
+
Please note that your file must have a column with the texts and a column with the labels,
|
| 177 |
+
for example
|
| 178 |
+
"""
|
| 179 |
+
)
|
| 180 |
+
st.table(
|
| 181 |
+
{
|
| 182 |
+
"text": ["A review", "Another review", "Yet another one", "etc"],
|
| 183 |
+
"label": ["Good", "Bad", "Good", "etc"],
|
| 184 |
+
}
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
st.subheader("Output format")
|
| 188 |
+
st.markdown(
|
| 189 |
+
"""
|
| 190 |
+
As a result of the process, you will get a file containing 4 columns:
|
| 191 |
+
- `Word`: the n-gram (i.e., a word or a concatenation of words) considered
|
| 192 |
+
- `Score`: the wordify score, between 0 and 1, of how important `Word` is to discriminate `Label`
|
| 193 |
+
- `Label`: the label that `Word` is discriminating
|
| 194 |
+
- `Correlation`: how `Word` is correlated with `Label` (e.g., "negative" means that if `Word` is present in the text then the label is less likely to be `Label`)
|
| 195 |
+
|
| 196 |
+
for example
|
| 197 |
+
"""
|
| 198 |
+
)
|
| 199 |
+
|
| 200 |
+
st.table(
|
| 201 |
+
{
|
| 202 |
+
"Word": ["good", "awful", "bad service", "etc"],
|
| 203 |
+
"Score": ["0.52", "0.49", "0.35", "etc"],
|
| 204 |
+
"Label": ["Good", "Bad", "Good", "etc"],
|
| 205 |
+
"Correlation": ["positive", "positive", "negative", "etc"],
|
| 206 |
+
}
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def how_it_works():
|
| 211 |
+
table2 = pd.DataFrame(
|
| 212 |
+
{
|
| 213 |
+
"Text": [
|
| 214 |
+
"Spice light wine",
|
| 215 |
+
"Wine oak heavy",
|
| 216 |
+
"Chardonnay buttery light",
|
| 217 |
+
"Wine light cherry",
|
| 218 |
+
"Chardonnay wine oak buttery",
|
| 219 |
+
],
|
| 220 |
+
"Label": ["Italy", "United States", "United States", "Italy", "United States"],
|
| 221 |
+
}
|
| 222 |
+
)
|
| 223 |
+
|
| 224 |
+
table3 = pd.DataFrame(
|
| 225 |
+
{
|
| 226 |
+
"Model": [1, 2, 3, 4],
|
| 227 |
+
"Buttery": [0.32, 0, 0, 0],
|
| 228 |
+
"Chardonnay": [3.78, 0, 0, 0],
|
| 229 |
+
"Cherry": [-2.49, 0, 0, -6.2],
|
| 230 |
+
"Heavy": [0, 3.62, 0, 0],
|
| 231 |
+
"Light": [-1.72, -4.38, 0, 0],
|
| 232 |
+
"Oak": [0, 0, 0, 0],
|
| 233 |
+
"Spice": [-2.49, 0, -6.2, 0],
|
| 234 |
+
"Wine": [0, 0, 0, 0],
|
| 235 |
+
},
|
| 236 |
+
dtype=str,
|
| 237 |
+
)
|
| 238 |
+
|
| 239 |
+
table4 = pd.DataFrame(
|
| 240 |
+
{
|
| 241 |
+
"Coefficient valence": ["positive", "negative"],
|
| 242 |
+
"Buttery": [0.25, 0],
|
| 243 |
+
"Chardonnay": [0.25, 0],
|
| 244 |
+
"Cherry": [0, 0.5],
|
| 245 |
+
"Heavy": [0.25, 0],
|
| 246 |
+
"Light": [0, 0.5],
|
| 247 |
+
"Oak": [0, 0],
|
| 248 |
+
"Spice": [0, 0.5],
|
| 249 |
+
"Wine": [0, 0],
|
| 250 |
+
},
|
| 251 |
+
dtype=str,
|
| 252 |
+
)
|
| 253 |
+
|
| 254 |
+
with st.expander("How Wordify works: an illustrative example"):
|
| 255 |
+
st.markdown(
|
| 256 |
+
f"""
|
| 257 |
+
To provide an intuitive example of how Wordify works, imagine we have the following five documents with hypothetical
|
| 258 |
+
descriptions of wines from the United States and Italy listed in table 2 (preprocessed to remove noise words).
|
| 259 |
+
"""
|
| 260 |
+
)
|
| 261 |
+
st.caption("Table 2: Descriptions of wines from the USA and Italy.")
|
| 262 |
+
st.table(table2)
|
| 263 |
+
|
| 264 |
+
st.markdown(
|
| 265 |
+
"""
|
| 266 |
+
Wordify now draws, say, four independent samples from this data, for example: `(1,3,4,5)`, `(1,2,2,4)`, `(1,1,2,3)`, and `(2,3,4,4)`.
|
| 267 |
+
We fit an L1-regularized Logistic Regression on each, with the United States as target class. This results in the following sparse
|
| 268 |
+
vectors of coefficients reported in table 3 (indicators that are not present in a run are listed as 0 here):
|
| 269 |
+
"""
|
| 270 |
+
)
|
| 271 |
+
st.caption("Table 3: Coefficients for frequency of indicators in each of the four runs for US wines.")
|
| 272 |
+
st.table(table3)
|
| 273 |
+
|
| 274 |
+
st.markdown(
|
| 275 |
+
"""
|
| 276 |
+
We can now count for each indicator how many times out of the four runs it received a non-zero coefficient (the magnitude does not matter).
|
| 277 |
+
We distinguish by positive and negative coefficients, and divide the result by the number of runs (here, four), which yields the final indicators
|
| 278 |
+
that are positively and negatively correlated with the US wines.
|
| 279 |
+
"""
|
| 280 |
+
)
|
| 281 |
+
st.caption("Table 4: Final set of indicators that are positively versus negatively correlated with US wines.")
|
| 282 |
+
st.table(table4)
|
| 283 |
+
st.markdown(
|
| 284 |
+
"""
|
| 285 |
+
The results of table 4 suggest that a wine is likely to be from the United States if its description contains any of the following words: "buttery",
|
| 286 |
+
"chardonnay", or "heavy", and these words are similarly discriminative. In contrast, a wine is likely to not be from the United States if it contains
|
| 287 |
+
the words "spice", "light", or "cherry". It is also worth noting that "oak" and "wine", which were present for both Italian and US wines, were ultimately
|
| 288 |
+
not selected as discriminative indicators of US wines. Finally, we would conduct an analogous analysis with Italy as the target class to determine which
|
| 289 |
+
indicators are most and least discriminative of Italian wines.
|
| 290 |
+
"""
|
| 291 |
+
)
|
| 292 |
+
|
| 293 |
+
|
| 294 |
def faq():
|
| 295 |
st.subheader("Frequently Asked Questions")
|
| 296 |
with st.expander("What is Wordify?"):
|
|
|
|
| 394 |
st.markdown(contacts(), unsafe_allow_html=True)
|
| 395 |
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
def footer():
|
| 398 |
st.sidebar.markdown(
|
| 399 |
"""
|
|
|
|
| 459 |
)
|
| 460 |
|
| 461 |
with st.expander("Vocabulary"):
|
| 462 |
+
st.markdown("The table below shows all candidate n-grams that Wordify considered")
|
|
|
|
|
|
|
| 463 |
st.write(meta_data["vocabulary"])
|
| 464 |
|
| 465 |
with st.expander("Labels"):
|
| 466 |
+
st.markdown("The table below summarizes the labels that your file contained")
|
|
|
|
|
|
|
| 467 |
st.write(meta_data["labels"])
|
| 468 |
|
| 469 |
return subset_df
|
|
|
|
| 493 |
"Chinese",
|
| 494 |
):
|
| 495 |
st.info(
|
| 496 |
+
msg + " However we will still remove stopwords since you selected `Spacy lemmatizer (remove stopwords)`."
|
|
|
|
| 497 |
)
|