import re

from nltk.corpus import stopwords
def find_common_subsequences(sentence, str_list, *, stop_words=None, max_n=5):
    """Find word n-grams of ``sentence`` that occur in every string of ``str_list``.

    All text is lower-cased, stripped of every character that is neither a
    word character nor whitespace (so ``Wilkes-Barre`` becomes
    ``wilkesbarre``), and then filtered of stop words.  N-grams are built
    from what remains of ``sentence``, longest first, and kept when they
    appear as a run of whole words in every cleaned string of ``str_list``.
    A shorter n-gram is skipped when it already occurs, as whole words,
    inside a phrase that has been kept.

    Args:
        sentence: Reference sentence the n-grams are taken from.
        str_list: Strings that must all contain an n-gram for it to count
            as common.  With an empty list every n-gram is vacuously
            common.
        stop_words: Optional iterable of words to ignore.  When ``None``
            the NLTK English stop-word list is used, which requires the
            NLTK ``stopwords`` corpus to be available.
        max_n: Largest n-gram size (in words) to consider.

    Returns:
        A list of ``(rank, phrase)`` tuples.  ``rank`` is 1-based and
        follows the order in which the phrases start in the cleaned
        sentence.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # The text is lower-cased before filtering, so compare in lower case.
    stop_words = {word.lower() for word in stop_words}

    def clean(text):
        # Lower-case, drop punctuation, then drop stop words.
        text = re.sub(r'[^\w\s]', '', text.lower())
        return " ".join(word for word in text.split() if word not in stop_words)

    words = clean(sentence).split()
    # Cleaned text is words joined by single spaces, so padding both sides
    # with a space turns plain substring search into whole-word matching.
    padded_candidates = [f" {clean(candidate)} " for candidate in str_list]

    accepted = []         # (start index in the cleaned sentence, phrase)
    accepted_padded = []  # the same phrases, space-padded for matching

    for n in range(min(max_n, len(words)), 0, -1):
        for start in range(len(words) - n + 1):
            phrase = " ".join(words[start:start + n])
            padded = f" {phrase} "
            # Skip phrases already covered by a kept phrase.  Matching on
            # whole words keeps e.g. "state" distinct from "united states".
            if any(padded in kept for kept in accepted_padded):
                continue
            if all(padded in candidate for candidate in padded_candidates):
                accepted.append((start, phrase))
                accepted_padded.append(padded)

    # Order by first appearance in the cleaned sentence, then number from 1.
    accepted.sort(key=lambda item: item[0])
    return [(rank, phrase) for rank, (_, phrase) in enumerate(accepted, start=1)]
def find_common_gram_positions(str_list, common_grams):
    """Locate each common gram in each sentence of ``str_list``.

    Sentences and grams are normalised the same way: characters that are
    neither word characters nor whitespace are removed, the text is
    lower-cased and split on whitespace.  Stop words are NOT removed here.

    The match is approximate: a gram counts as found when every one of its
    words occurs somewhere in the sentence (adjacency and order are not
    checked), and the reported position is the first occurrence of the
    gram's first word.

    Args:
        str_list: Sentences to search.
        common_grams: Iterable of gram strings to look for.

    Returns:
        One list per sentence holding, for each gram in order, the 1-based
        word position described above, or ``-1`` when the gram is not found
        or contains no words.
    """
    # Tokenise the grams once; the result does not depend on the sentence.
    gram_tokens = [
        re.sub(r'[^\w\s]', '', gram).lower().split() for gram in common_grams
    ]

    positions = []
    for sentence in str_list:
        words = re.sub(r'[^\w\s]', '', sentence).lower().split()

        # 1-based position of the first occurrence of every word.
        first_seen = {}
        for idx, word in enumerate(words, start=1):
            first_seen.setdefault(word, idx)

        sentence_positions = []
        for tokens in gram_tokens:
            # An empty gram has no first word to locate, so report -1
            # instead of indexing into an empty list.
            if tokens and all(token in first_seen for token in tokens):
                sentence_positions.append(first_seen[tokens[0]])
            else:
                sentence_positions.append(-1)
        positions.append(sentence_positions)

    return positions
# # Example usage
# sentence = "Donald Trump said at a campaign rally event in Wilkes-Barre, Pennsylvania, that there has “never been a more dangerous time since the Holocaust” to be Jewish in the United States."
# str_list = [
# 'During a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump stated that being Jewish in the United States has never been more hazardous since the Holocaust.',
# 'At a campaign rally in Wilkes-Barre, Pennsylvania, Donald Trump declared that being Jewish in the United States has never been more hazardous since the Holocaust.',
# 'Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania, and stated that being Jewish in the United States has never been more perilous since the Holocaust.',
# 'Donald Trump made the statement at a campaign rally in Wilkes-Barre, Pennsylvania, saying that being Jewish in the United States has never been more dangerous since the Holocaust.',
# 'Last month, Donald Trump spoke at a campaign rally in Wilkes-Barre, Pennsylvania and stated that being Jewish in the United States has never been more hazardous than during World War II.',
# 'In Wilkes-Barre, Pennsylvania, Donald Trump spoke at a campaign rally and claimed that the Holocaust was always more hazardous for Jews in the United States.',
# 'A campaign rally in Wilkes-Barre, Pennsylvania saw Donald Trump declare that being Jewish in the United States has never been more perilous since WWII.',
# 'Speaking at a campaign rally in Wilkes-Barre, Pennsylvania today, Donald Trump declared that being Jewish has never been more hazardous in the United States since the Holocaust.',
# 'During his campaign rally in Wilkes-Barre, Pennsylvania today Donald Trump stated: "There has never been a safer place for being Jewish in the United States since the Holocaust."',
# 'At a campaign rally in Wilkes-Barre, Pennsylvania (pictured), Donald Trump said, "There has never been... gotten worse for being Jewish in America since the Holocaust."'
# ]
# # Find common subsequences
# common_grams = find_common_subsequences(sentence, str_list)
# # Extract the subsequences from the common grams for position checking
# subsequences = [subseq for _, subseq in common_grams]
# # Find positions of the common grams
# common_gram_positions = find_common_gram_positions(str_list, subsequences)
# print(common_grams)