Spaces:

asach
/

arxiv-plagiarism-checker-Ilm

Runtime error

App Files Files Community

gamingflexer commited on Jan 29, 2024

Commit

0492a6a

1 Parent(s): 80ed9e0

Fucntion added research

Browse files

Files changed (2) hide show

src/plagiarism/preprocessing.py +17 -0
src/plagiarism/similarity_algos.py +49 -0

src/plagiarism/preprocessing.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+def remove_numbers(words_list: list) -> list:
+    """Remove all numbers from a list of strings."""
+    return [word for word in words_list if not word.isdigit()]
+def remove_stop_words(words_list: list) -> list:
+    """Remove stop words from a list of strings."""
+    stop_words = set(stopwords.words('english'))
+    return [word for word in words_list if word.lower() not in stop_words]
+def lemmatize(words_list: list) -> list:
+    """Lemmatize a list of strings."""
+    lemmatizer = WordNetLemmatizer()
+    return [lemmatizer.lemmatize(word) for word in words_list]

src/plagiarism/similarity_algos.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import difflib
+from preprocessing import remove_numbers, remove_stop_words, lemmatize
+def difflib_overlap(word_token1: list, word_token2: list) -> float:
+    """Get similarity percentage from matching sequences between two strings"""
+    seq = difflib.SequenceMatcher(a=word_token1, b=word_token2)
+    # Return similarity percentage based on difflib library Sequence Matcher
+    return round(seq.ratio() * 100, 3)
+def calculate_overlap(word_token1: list, word_token2: list) -> float:
+    """Get similarity percentage from usage of similar words in two strings"""
+    overlapping_words = []
+    for word in word_token1:
+        if word in word_token2:
+            overlapping_words.append(word)
+    overlap_percentage = len(overlapping_words) / len(word_token1) * 100
+    return round(overlap_percentage, 3)
+def calculate_jaccard(word_tokens1: list, word_tokens2: list) -> float:
+    """Calculates intersection over union and return Jaccard similarity score"""
+    list1, list2 = remove_numbers(word_tokens1), remove_numbers(word_tokens2)
+    list1, list2 = remove_stop_words(list1), remove_stop_words(list2)
+    list1, list2 = lemmatize(list1), lemmatize(list2)
+    # Combine both tokens to find union
+    both_tokens = list1 + list2
+    union = set(both_tokens)
+    # Calculate intersection
+    intersection = set()
+    for word in list1:
+        if word in list2:
+            intersection.add(word)
+    jaccard_score = len(intersection) / len(union)
+    return round(jaccard_score, 3)