| from transformers import AutoTokenizer, AutoModelForMaskedLM | |
| from transformers import pipeline | |
| import random | |
| from nltk.corpus import stopwords | |
# Masking Model
def mask_non_stopword(sentence):
    """Replace one randomly chosen non-stopword token in *sentence* with '[MASK]'.

    Tokens are produced by whitespace split; stopword membership is checked
    case-insensitively against NLTK's English stopword list. If every token is
    a stopword (or the sentence is empty), the sentence is returned unchanged.

    Note: output is re-joined with single spaces, so runs of whitespace in the
    input are normalized — an accepted tradeoff for replacing the exact chosen
    token.
    """
    stop_words = set(stopwords.words('english'))
    words = sentence.split()
    # Choose among token *indices*, not token strings. The original
    # str.replace(word_to_mask, '[MASK]', 1) replaced the first substring
    # occurrence, which could corrupt a different word containing the chosen
    # token (e.g. masking "cat" inside "concatenation") or mask an earlier
    # duplicate instead of a specific token.
    candidate_idxs = [i for i, word in enumerate(words)
                      if word.lower() not in stop_words]
    if not candidate_idxs:
        return sentence
    words[random.choice(candidate_idxs)] = '[MASK]'
    return ' '.join(words)
# Load tokenizer and model for masked language model.
# NOTE(review): these calls run at import time and, on first use, download the
# bert-large-cased-whole-word-masking weights from the Hugging Face hub —
# consider lazy-loading if this module is imported by code that does not
# always need the pipeline.
tokenizer = AutoTokenizer.from_pretrained("bert-large-cased-whole-word-masking")
model = AutoModelForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking")
# fill-mask pipeline: given a sentence containing a [MASK] token, returns
# ranked candidate completions (used by mask() below).
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
def mask(sentence):
    """Run the fill-mask pipeline on *sentence* and return candidate sentences.

    *sentence* is expected to contain a '[MASK]' token for the pipeline to
    fill. NOTE(review): with multiple mask tokens the transformers fill-mask
    pipeline returns nested lists, which this function would pass through
    unflattened — confirm callers only ever supply a single mask.

    Returns:
        list[str]: one fully-filled sentence per prediction, in the order the
        pipeline ranks them (highest score first).
    """
    predictions = fill_mask(sentence)
    # Iterate the predictions directly instead of indexing via range(len(...)).
    return [prediction['sequence'] for prediction in predictions]