babilonczyk committed
Commit 1cab606 · verified · 1 Parent(s): b48bc49

Upload 14 files

models/__init__.py ADDED
File without changes
models/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (157 Bytes)

models/__pycache__/esm_2_650m.cpython-313.pyc ADDED
Binary file (1.33 kB)
models/esm_2_650m.py ADDED
@@ -0,0 +1,63 @@
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+
+ tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
+ model = AutoModel.from_pretrained("facebook/esm2_t33_650M_UR50D")
+
+
+ # -------------------------------------------------------------------------------------------------------
+ # get_embedding:
+ # This function takes a protein sequence (like "MKTFFV...") and turns it into a single vector (embedding)
+ # using the ESM-2 protein language model. This vector acts as a "fingerprint" of the sequence
+ # and can be compared with others using cosine similarity.
+
+ # Here's how it works, step by step:
+
+ # 1. Tokenization (turn text into numbers):
+ # The input sequence (a string of amino acids) is turned into tokens that the model understands.
+ # The tokenizer:
+ # - Adds special tokens: [CLS] at the beginning (used as a summary marker) and [EOS] at the end
+ # - Pads or truncates if needed
+ # - Returns a PyTorch tensor with shape [1, sequence_length] so the model can process it.
+
+ # 2. Model Inference (generate the "hidden states" or embeddings):
+ # We feed the tokenized input into the ESM-2 model. It outputs a 3D tensor:
+ # [batch_size, sequence_length, embedding_dim] → e.g. [1, 35, 1280]
+ # This means: for each of the 35 tokens (amino acids plus the special tokens), we get a 1280-dimensional
+ # vector that captures its meaning based on the entire sequence (like understanding a word in context).
+
+ # 3. Embedding Extraction:
+ # We extract two types of vectors:
+ # - CLS vector (position 0): a single vector meant to summarize the entire sequence
+ # - Mean vector: we average all the remaining vectors (everything after [CLS]) for a smoothed-out view
+
+ # 4. Feature Fusion (merge the summary + content vectors):
+ # We concatenate the CLS vector and the mean vector, so our final embedding includes:
+ # - Global summary (CLS)
+ # - Averaged local context (mean)
+ # This creates a more informative representation than using only one of them.
+
+ # 5. Normalization (make comparison fair):
+ # We convert the final vector into a unit vector, meaning its length becomes 1.
+ # This is essential for cosine similarity to work properly: we want to compare direction, not magnitude.
+
+ # Output:
+ # A 1-D NumPy array representing the final embedding for the input protein sequence.
+ # This vector can now be used for comparing sequences, clustering, or feeding into ML models.
+
+
+ def get_embedding(sequence: str):
+     tokens = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True)
+     with torch.no_grad():
+         outputs = model(**tokens)
+
+     cls_vec = outputs.last_hidden_state[:, 0, :]  # [CLS] token
+     mean_vec = outputs.last_hidden_state[:, 1:, :].mean(dim=1)  # Skip [CLS]; the trailing [EOS] is included in the mean
+
+     # Concatenate CLS + mean → a 2560-dimensional fused embedding (1280 + 1280)
+     embedding = torch.cat([cls_vec, mean_vec], dim=-1).squeeze()
+
+     # Normalize the embedding (unit vector) so cosine similarity reduces to a dot product
+     embedding = embedding / embedding.norm()
+
+     return embedding.numpy()
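The comment block above names cosine similarity as the intended use of these embeddings. A minimal usage sketch, not part of this commit (the two sequences are arbitrary examples; the second is just a truncated variant of the first):

    import numpy as np
    from models.esm_2_650m import get_embedding

    seq_a = "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQ"
    seq_b = "MTEYKLVVVGAGGVGKSALTIQLIQNHF"

    emb_a = get_embedding(seq_a)
    emb_b = get_embedding(seq_b)

    # get_embedding returns unit vectors, so the dot product is already the cosine similarity
    similarity = float(np.dot(emb_a, emb_b))
    print(f"cosine similarity: {similarity:.3f}")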
tests/__init__.py ADDED
File without changes
tests/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (156 Bytes)

tests/models/__init__.py ADDED
File without changes
tests/models/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (163 Bytes)

tests/models/__pycache__/test_esm_w_650m.cpython-313-pytest-8.4.1.pyc ADDED
Binary file (5.12 kB)
tests/models/test_esm_w_650m.py ADDED
@@ -0,0 +1,13 @@
+ import numpy as np
+ from models.esm_2_650m import get_embedding
+
+
+ def test_get_embedding_shape_and_type():
+     # Example short protein sequence
+     sequence = "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQ"
+     embedding = get_embedding(sequence)
+
+     assert isinstance(embedding, np.ndarray)
+     assert embedding.ndim == 1
+     assert embedding.shape[0] in [1280, 2560]
+     assert embedding.dtype == np.float32 or embedding.dtype == np.float64
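Note: given the CLS + mean concatenation in models/esm_2_650m.py, the fused embedding should always come out 2560-dimensional for this 650M checkpoint (1280 + 1280); the 1280 branch of the shape assertion would only matter if the fusion step were dropped. A quick sanity check consistent with the code above (the short sequence is an arbitrary example):

    >>> from models.esm_2_650m import get_embedding
    >>> get_embedding("MKTFFV").shape
    (2560,)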
tests/utils/__init__.py ADDED
File without changes
tests/utils/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (162 Bytes)

tests/utils/__pycache__/test_compare_embeddings.cpython-313-pytest-8.4.1.pyc ADDED
Binary file (7.83 kB)
tests/utils/test_compare_embeddings.py ADDED
@@ -0,0 +1,47 @@
+ import numpy as np
+ from utils import compare_embeddings
+
+
+ def test_very_high_similarity():
+     emb1 = np.array([0.1, 0.2, 0.3])
+     emb2 = np.array([0.1, 0.2, 0.3])
+     similarity, classification = compare_embeddings(emb1, emb2)
+
+     assert similarity >= 0.85
+     assert classification == "very high similarity (clear homology)"
+
+
+ def test_high_similarity():
+     emb1 = np.array([1, 0, 0])
+     emb2 = np.array([0.8, 0.6, 0])
+     similarity, classification = compare_embeddings(emb1, emb2)
+
+     assert 0.70 <= similarity < 0.85
+     assert classification == "high similarity (likely homologous)"
+
+
+ def test_moderate_similarity():
+     emb1 = np.array([1, 0, 0])
+     emb2 = np.array([0.6, 0.6, 0.6])
+     similarity, classification = compare_embeddings(emb1, emb2)
+
+     assert 0.50 <= similarity < 0.70
+     assert classification == "moderate similarity (possible remote homolog)"
+
+
+ def test_low_similarity():
+     emb1 = np.array([1, 0, 0])
+     emb2 = np.array([0.3, 0.95, 0])
+     similarity, classification = compare_embeddings(emb1, emb2)
+
+     assert 0.30 <= similarity < 0.50
+     assert classification == "low similarity (likely not homologous)"
+
+
+ def test_very_low_similarity():
+     emb1 = np.array([1, 0, 0])
+     emb2 = np.array([0, 1, 0])
+     similarity, classification = compare_embeddings(emb1, emb2)
+
+     assert similarity < 0.30
+     assert classification == "very low similarity (unrelated / random match)"
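The utils module these tests import is among the uploaded files, but its diff is not shown on this page. A minimal sketch of a compare_embeddings consistent with the assertions above; the threshold bands and labels are taken directly from the tests, while the module layout and signature are assumed from the import line:

    import numpy as np

    def compare_embeddings(emb1: np.ndarray, emb2: np.ndarray):
        # Cosine similarity; normalize here so callers may pass raw (unnormalized) vectors
        similarity = float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))

        # Threshold bands and labels mirror the test assertions
        if similarity >= 0.85:
            classification = "very high similarity (clear homology)"
        elif similarity >= 0.70:
            classification = "high similarity (likely homologous)"
        elif similarity >= 0.50:
            classification = "moderate similarity (possible remote homolog)"
        elif similarity >= 0.30:
            classification = "low similarity (likely not homologous)"
        else:
            classification = "very low similarity (unrelated / random match)"

        return similarity, classification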