razaimam45 committed
Commit a96891a · verified · 1 parent: 110f995

Upload 108 files

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .gitattributes +4 -0
  2. BetaMixture.py +187 -0
  3. LICENSE +21 -0
  4. README.md +137 -3
  5. baselines.py +51 -0
  6. clip/__init__.py +2 -0
  7. clip/__pycache__/__init__.cpython-310.pyc +0 -0
  8. clip/__pycache__/__init__.cpython-312.pyc +0 -0
  9. clip/__pycache__/__init__.cpython-38.pyc +0 -0
  10. clip/__pycache__/__init__.cpython-39.pyc +0 -0
  11. clip/__pycache__/clip.cpython-310.pyc +0 -0
  12. clip/__pycache__/clip.cpython-312.pyc +0 -0
  13. clip/__pycache__/clip.cpython-38.pyc +0 -0
  14. clip/__pycache__/clip.cpython-39.pyc +0 -0
  15. clip/__pycache__/cocoop.cpython-310.pyc +0 -0
  16. clip/__pycache__/cocoop.cpython-312.pyc +0 -0
  17. clip/__pycache__/cocoop.cpython-39.pyc +0 -0
  18. clip/__pycache__/custom_clip.cpython-310.pyc +0 -0
  19. clip/__pycache__/custom_clip.cpython-312.pyc +0 -0
  20. clip/__pycache__/custom_clip.cpython-39.pyc +0 -0
  21. clip/__pycache__/custom_medclip.cpython-310.pyc +0 -0
  22. clip/__pycache__/custom_medclip.cpython-312.pyc +0 -0
  23. clip/__pycache__/custom_medclip.cpython-39.pyc +0 -0
  24. clip/__pycache__/model.cpython-310.pyc +0 -0
  25. clip/__pycache__/model.cpython-312.pyc +0 -0
  26. clip/__pycache__/model.cpython-38.pyc +0 -0
  27. clip/__pycache__/model.cpython-39.pyc +0 -0
  28. clip/__pycache__/simple_tokenizer.cpython-310.pyc +0 -0
  29. clip/__pycache__/simple_tokenizer.cpython-312.pyc +0 -0
  30. clip/__pycache__/simple_tokenizer.cpython-38.pyc +0 -0
  31. clip/__pycache__/simple_tokenizer.cpython-39.pyc +0 -0
  32. clip/bpe_simple_vocab_16e6.txt.gz +3 -0
  33. clip/clip.py +232 -0
  34. clip/cocoop.py +234 -0
  35. clip/custom_clip.py +388 -0
  36. clip/custom_medclip.py +389 -0
  37. clip/model.py +438 -0
  38. clip/simple_tokenizer.py +132 -0
  39. data/__init__.py +0 -0
  40. data/__pycache__/__init__.cpython-310.pyc +0 -0
  41. data/__pycache__/__init__.cpython-311.pyc +0 -0
  42. data/__pycache__/__init__.cpython-312.pyc +0 -0
  43. data/__pycache__/__init__.cpython-39.pyc +0 -0
  44. data/__pycache__/augmix_ops.cpython-310.pyc +0 -0
  45. data/__pycache__/augmix_ops.cpython-311.pyc +0 -0
  46. data/__pycache__/augmix_ops.cpython-312.pyc +0 -0
  47. data/__pycache__/augmix_ops.cpython-39.pyc +0 -0
  48. data/__pycache__/cls_to_names.cpython-310.pyc +0 -0
  49. data/__pycache__/cls_to_names.cpython-312.pyc +0 -0
  50. data/__pycache__/cls_to_names.cpython-39.pyc +0 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ figures/eval.png filter=lfs diff=lfs merge=lfs -text
37
+ figures/method.png filter=lfs diff=lfs merge=lfs -text
38
+ figures/mi_v_ent.png filter=lfs diff=lfs merge=lfs -text
39
+ figures/results.png filter=lfs diff=lfs merge=lfs -text
BetaMixture.py ADDED
@@ -0,0 +1,187 @@
1
+ import numpy as np
2
+ from scipy.special import betaln, logsumexp
3
+ from sklearn.cluster import KMeans
4
+
5
+ class BetaMixtureModel:
6
+ """
7
+ Beta Mixture Model (Multivariate version).
8
+ Each dimension is modeled independently by a Beta distribution.
9
+ """
10
+
11
+ def __init__(self, n_mixtures=3, random_seed=1):
12
+ self.n_mixtures = n_mixtures
13
+ self.random_seed = random_seed
14
+ self.convergence = False
15
+
16
+ def _init_clusters(self, data_matrix, init_round):
17
+ """
18
+ Initialize the mixture responsibilities (assignments) via k-means or uniformly at random.
19
+ """
20
+ if self.method == "kmeans":
21
+ km = KMeans(
22
+ n_clusters=self.n_mixtures,
23
+ n_init=1,
24
+ random_state=self.random_seed + init_round
25
+ ).fit(data_matrix)
26
+ resp_matrix = np.zeros((self.n_observations, self.n_mixtures))
27
+ resp_matrix[np.arange(self.n_observations), km.labels_] = 1
28
+ else:
29
+ np.random.seed(self.random_seed + init_round)
30
+ resp_matrix = np.random.rand(self.n_observations, self.n_mixtures)
31
+ resp_matrix /= resp_matrix.sum(axis=1, keepdims=True)
32
+
33
+ # Numerical stability
34
+ resp_matrix += 10 * np.finfo(resp_matrix.dtype).eps
35
+
36
+ # Initialize beta parameters (alpha/beta for each dimension)
37
+ self.beta_params_ = np.zeros((self.n_mixtures, self.n_components * 2))
38
+ self._M_step(data_matrix, np.log(resp_matrix))
39
+
40
+
41
+ def _calc_log_weights(self):
42
+ """
43
+ Return log of current mixture weights.
44
+ """
45
+ return np.log(self.mix_weights_)
46
+
47
+ def _calc_mixture_log_probs(self, data_matrix, mixture_idx):
48
+ """
49
+ Compute log-prob for a single mixture (used if parallelized).
50
+ """
51
+ alpha_vec = self.beta_params_[mixture_idx, :self.n_components]
52
+ beta_vec = self.beta_params_[mixture_idx, self.n_components:]
53
+ beta_func_log = betaln(alpha_vec, beta_vec)
54
+ return (
55
+ (alpha_vec - 1) * np.log(data_matrix)
56
+ + (beta_vec - 1) * np.log(1 - data_matrix)
57
+ - beta_func_log
58
+ ).sum(axis=1)
59
+
60
+ def _calc_log_probs_all_mixtures(self, data_matrix):
61
+ """
62
+ Return log-prob for each observation under each mixture (unnormalized).
63
+ """
64
+ log_prob = np.empty((self.n_observations, self.n_mixtures))
65
+ for mix in range(self.n_mixtures):
66
+ alpha_vec = self.beta_params_[mix, :self.n_components]
67
+ beta_vec = self.beta_params_[mix, self.n_components:]
68
+ bfn = betaln(alpha_vec, beta_vec)
69
+ log_prob[:, mix] = (
70
+ (alpha_vec - 1) * np.log(data_matrix)
71
+ + (beta_vec - 1) * np.log(1 - data_matrix)
72
+ - bfn
73
+ ).sum(axis=1)
74
+ return log_prob
75
+
76
+ def _calc_weighted_log_probs(self, data_matrix):
77
+ """
78
+ Return the sum of log-probabilities and log-weights.
79
+ """
80
+ return self._calc_log_probs_all_mixtures(data_matrix) + self._calc_log_weights()
81
+
82
+ def _calc_log_resp_and_norm(self, data_matrix):
83
+ """
84
+ Return (log_prob_norm, log_resp) for the E-step.
85
+ """
86
+ weighted_lp = self._calc_weighted_log_probs(data_matrix)
87
+ lp_norm = logsumexp(weighted_lp, axis=1)
88
+ with np.errstate(under="ignore"):
89
+ log_resp = weighted_lp - lp_norm[:, None]
90
+ return lp_norm, log_resp
91
+
92
+ def _E_step(self, data_matrix):
93
+ """
94
+ E-step: compute average log_prob_norm and log_resp.
95
+ """
96
+ lp_norm, log_resp = self._calc_log_resp_and_norm(data_matrix)
97
+ return np.mean(lp_norm), log_resp
98
+
99
+ def _compute_responsibilities(self, log_resp):
100
+ """
101
+ Exponentiate log_resp and sum across observations.
102
+ """
103
+ resp_matrix = np.exp(log_resp)
104
+ cluster_counts = resp_matrix.sum(axis=0) + 10 * np.finfo(resp_matrix.dtype).eps
105
+ return resp_matrix, cluster_counts
106
+
107
+ def _update_mixture_weights(self, cluster_counts):
108
+ """
109
+ Update mixture weights from mixture counts.
110
+ """
111
+ self.mix_weights_ = cluster_counts / cluster_counts.sum()
112
+
113
+ def _M_step(self, data_matrix, log_resp):
114
+ """
115
+ M-step: update weights and Beta distribution parameters via moment matching.
116
+ """
117
+ resp_matrix, cluster_counts = self._compute_responsibilities(log_resp)
118
+ self._update_mixture_weights(cluster_counts)
119
+
120
+ w_sums = resp_matrix.T @ data_matrix
121
+ w_sums_sq = resp_matrix.T @ (data_matrix ** 2)
122
+
123
+ for m_idx in range(self.n_mixtures):
124
+ sum_vals = w_sums[m_idx]
125
+ sum_sq_vals = w_sums_sq[m_idx]
126
+ mean_val = sum_vals / cluster_counts[m_idx]
127
+ var_val = sum_sq_vals / cluster_counts[m_idx] - mean_val ** 2
128
+
129
+ # Clip variance
130
+ variance_cap = mean_val * (1 - mean_val) / 4
131
+ var_val = np.minimum(var_val, variance_cap)
132
+ var_val += 10 * np.finfo(var_val.dtype).eps
133
+
134
+ # Method-of-moments fit: factor = mean*(1-mean)/var - 1, so alpha = factor*mean, beta = factor*(1-mean)
135
+ scaling_factor = (mean_val * (1 - mean_val)) / (var_val + 1e-10) - 1
136
+ self.beta_params_[m_idx, :self.n_components] = scaling_factor * mean_val
137
+ self.beta_params_[m_idx, self.n_components:] = scaling_factor * (1 - mean_val)
138
+
139
+ def fit(self, data_matrix, num_init=3, method="kmeans", max_iter=1000, tol=1e-4):
140
+ """
141
+ Fit BetaMixtureModel to the data using EM, possibly with multiple initializations.
142
+ """
143
+ self.n_observations, self.n_components = data_matrix.shape
144
+ self.convergence = False
145
+ self.method = method
146
+ best_lower_bound = -np.inf
147
+ optimal_params = None
148
+
149
+ for init_round in range(num_init):
150
+ # print(f"{init_round + 1}-th BMM initialization")
151
+ self._init_clusters(data_matrix, init_round)
152
+ ll_bound = -np.inf
153
+
154
+ for _ in range(max_iter):
155
+ prev_bound = ll_bound
156
+ lp_norm, log_resp = self._E_step(data_matrix)
157
+ self._M_step(data_matrix, log_resp)
158
+ ll_bound = lp_norm
159
+ delta_bound = ll_bound - prev_bound
160
+
161
+ if abs(delta_bound) < tol:
162
+ self.convergence = True
163
+ break
164
+
165
+ if ll_bound > best_lower_bound:
166
+ best_lower_bound = ll_bound
167
+ # Update final weights
168
+ _, cluster_counts = self._compute_responsibilities(log_resp)
169
+ self._update_mixture_weights(cluster_counts)
170
+ optimal_params = (self.mix_weights_.copy(), self.beta_params_.copy())
171
+
172
+ self.mix_weights_, self.beta_params_ = optimal_params
173
+ self.max_lower_bound = best_lower_bound
174
+ return self
175
+
176
+ def predict_proba(self, data_matrix):
177
+ """
178
+ Return the per-mixture membership probabilities for each sample.
179
+ """
180
+ _, log_resp = self._calc_log_resp_and_norm(data_matrix)
181
+ return np.exp(log_resp)
182
+
183
+ def predict(self, data_matrix):
184
+ """
185
+ Return the most probable mixture index for each sample.
186
+ """
187
+ return np.argmax(self.predict_proba(data_matrix), axis=1)
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 razaimam45
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,137 @@
1
- ---
2
- license: mit
3
- ---
1
+ # T³: Test-Time Model Merging for Medical Vision-Language Models
2
+
3
+ ![T³ Workflow](figures/method.png)
4
+ *Figure 1: Dynamic test-time merging workflow of T³*
5
+
6
+ Official implementation of **T³: Test-Time Model Merging in Vision-Language Models for Zero-Shot Medical Imaging**, a method for adaptive fusion of pretrained and fine-tuned vision-language models at test time using Jensen-Shannon divergence.
7
+
8
+ ---
9
+
10
+ ## Key Features
11
+ - 🧠 **Mutual Information Guidance**: Uses JS divergence to measure model consensus.
12
+ - ⚡ **Backpropagation-Free**: No gradient updates required during inference.
13
+ - 🏥 **Medical Modality Agnostic**: Validated consistently across four medical imaging domains.
14
+ - 🚀 **Batch-Wise Efficiency**: Reduces compute cost by 32x vs sample-wise merging.
15
+ - 📈 **SOTA Performance**: Outperforms 8+ baselines in accuracy & robustness.
16
+
17
+ ---
18
+
19
+ ## Table of Contents
20
+ - [Installation](#installation)
21
+ - [Method Overview](#method-overview)
22
+ - [Folder Structure](#folder-structure)
23
+ - [Reproducing Results](#reproducing-results)
24
+ - [Pretrained Weights](#pretrained-weights)
25
+ - [Citation](#citation)
26
+
27
+ ## Installation
28
+
29
+ 1. Clone repository:
30
+ ```bash
31
+ git clone https://github.com/yourusername/T3.git
32
+ cd T3
33
+ ```
34
+
35
+ 2. Create conda environment:
36
+ ```bash
37
+ conda create -n t3 python=3.9
38
+ conda activate t3
39
+ pip install -r requirements.txt
40
+ ```
41
+
42
+ ## Method Overview
43
+
44
+ ### Adaptive Merging via Jensen-Shannon Divergence
45
+ The interpolation coefficient λ is computed dynamically for each sample using the following equation (a minimal sketch of this computation appears after the definitions below):
46
+
47
+ ```math
48
+ λ(x) = λ_{min} + (λ_{max}-λ_{min})σ(γ⋅JS(p_{pt}(x)‖p_{ft}(x)))
49
+ ```
50
+
51
+ Where:
52
+ - `JS` = Jensen-Shannon divergence between pretrained and fine-tuned model predictions.
53
+ - `σ` = Sigmoid function for smooth scaling.
54
+ - `γ` = Scaling factor (default=0.5).
55
+
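+ A minimal reference sketch of this computation is shown below (it assumes softmax probabilities from both models; the function names and the default `λ_min`/`λ_max` values are illustrative, not the repository's API):
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def js_divergence(p_pt, p_ft, eps=1e-8):
+     # p_pt, p_ft: (batch, n_classes) probability vectors
+     p_bar = 0.5 * (p_pt + p_ft)
+     kl_pt = (p_pt * (torch.log(p_pt + eps) - torch.log(p_bar + eps))).sum(dim=-1)
+     kl_ft = (p_ft * (torch.log(p_ft + eps) - torch.log(p_bar + eps))).sum(dim=-1)
+     return 0.5 * (kl_pt + kl_ft)
+
+ def merging_coefficient(logits_pt, logits_ft, lam_min=0.0, lam_max=1.0, gamma=0.5):
+     # λ(x) = λ_min + (λ_max - λ_min) * σ(γ · JS(p_pt ‖ p_ft)); the default bounds are placeholders
+     p_pt = F.softmax(logits_pt, dim=-1)
+     p_ft = F.softmax(logits_ft, dim=-1)
+     js = js_divergence(p_pt, p_ft)  # shape: (batch,)
+     return lam_min + (lam_max - lam_min) * torch.sigmoid(gamma * js)
+ ```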
56
+ ### Visual Explanation of the Method
57
+ The following analyses motivate the method and illustrate its effectiveness:
58
+
59
+ ### Dynamic Weighting Based on Model Agreement
60
+
61
+ We propose using Jensen–Shannon (JS) divergence to measure mutual information between pretrained (`p_pt`) and fine-tuned (`p_ft`) model predictions, offering a more robust gauge of joint confidence than entropy-based methods like DaWin's entropy ratio:
62
+
63
+ ```math
64
+ R(x) = \frac{\mathcal{H}(p_{ft}(x))}{\mathcal{H}(p_{pt}(x)) + \mathcal{H}(p_{ft}(x))}
65
+ ```
66
+
67
+ JS divergence explicitly captures agreement vs. disagreement by comparing full predictive distributions:
68
+
69
+ ```math
70
+ I(x) = \frac{1}{2} \Bigl(\mathrm{KL}(p_{pt}(x) \Vert \bar{p}(x)) + \mathrm{KL}(p_{ft}(x) \Vert \bar{p}(x))\Bigr)
71
+ ```
72
+ where
73
+ ```math
74
+ \bar{p}(x) = 0.5 \cdot (p_{pt}(x) + p_{ft}(x))
75
+ ```
76
+
77
+ This ensures:
78
+ - \(I(x) = 0\) when models fully agree.
79
+ - \(I(x) > 0\) when confident predictions disagree.
80
+
81
+ Empirically, \(I(x)\) correlates positively with \(R(x)\), but better distinguishes disagreements, validating its use for adaptive merging.
82
+
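+ As a toy numerical illustration of this point (hypothetical two-class predictions, not results from the paper), two confident predictions yield an entropy ratio of about 0.5 whether the models agree or contradict each other, whereas JS cleanly separates the two cases:
+
+ ```python
+ import numpy as np
+
+ def entropy(p):
+     return -(p * np.log(p + 1e-12)).sum()
+
+ def js(p, q):
+     m = 0.5 * (p + q)
+     kl = lambda a, b: (a * (np.log(a + 1e-12) - np.log(b + 1e-12))).sum()
+     return 0.5 * (kl(p, m) + kl(q, m))
+
+ p_pt = np.array([0.99, 0.01])      # confident pretrained prediction
+ p_agree = np.array([0.99, 0.01])   # fine-tuned model agrees
+ p_flip = np.array([0.01, 0.99])    # fine-tuned model confidently disagrees
+
+ # Entropy ratio R(x) is ~0.5 in both cases and cannot tell them apart,
+ # while JS is ~0 under agreement and ~0.64 nats under disagreement.
+ print(entropy(p_agree) / (entropy(p_pt) + entropy(p_agree)))  # 0.5
+ print(entropy(p_flip) / (entropy(p_pt) + entropy(p_flip)))    # 0.5
+ print(js(p_pt, p_agree), js(p_pt, p_flip))                    # ~0.0, ~0.64
+ ```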
83
+ 2. **Mutual Information vs. Entropy**
84
+ ![MI vs Entropy](figures/mi_v_ent.png)
85
+ *Figure 3: Relationship between mutual information and entropy for adaptive merging.*
86
+
87
+ 3. **Performance Across Modalities**
88
+ ![Performance Comparison](figures/results.png)
89
+ *Figure 4: T³ achieves superior performance across multiple medical imaging modalities.*
90
+
91
+ ---
92
+
93
+ ## Folder Structure
94
+
95
+ ```
96
+ T3/
97
+ ├── clip/ # CLIP model adaptations
98
+ ├── data/ # Data Utilities
99
+ ├── utils/ # Helper functions
100
+ ├── baselines.py # Comparison methods
101
+ ├── t_cube.py # Core T³ implementation
102
+ ├── BetaMixture.py # Auxiliary models
103
+ └── README.md # This document
104
+ ```
105
+
106
+ ---
107
+
108
+ ## Reproducing Results
109
+
110
+ To reproduce the results from the paper, you can run the `t_cube.py` script. This script handles the evaluation of T³ and its baselines across multiple datasets and severity levels. Additional baselines are available in `baselines.py`.
111
+
112
+ To understand the script better:
113
+ - Refer to the `compute_samplewise_tcube_weights` and `compute_samplewise_tcube_weights_MI` functions for the entropy-based (DaWin baseline) and our mutual-information-based merging, respectively; a minimal sketch of the merging step follows this list.
114
+ - Check the `evaluate_on_test_set` function for how datasets and severities are processed.
115
+ - Explore the `evaluate_tcube` function for the merging and evaluation logic.
116
+
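+ The merging step itself reduces to a batch-wise linear interpolation of the two state dicts. A minimal sketch follows; the helper name, the convention that λ weights the fine-tuned model, and the usage lines are illustrative assumptions rather than the exact `t_cube.py` implementation:
+
+ ```python
+ import copy
+
+ def merge_state_dicts(sd_pt, sd_ft, lam):
+     # lam: batch-level coefficient (e.g. the mean of the per-sample λ(x) values)
+     return {k: (1 - lam) * sd_pt[k] + lam * sd_ft[k] for k in sd_pt}
+
+ # Usage sketch (model and variable names are placeholders):
+ # merged_model = copy.deepcopy(clip_pt)
+ # merged_model.load_state_dict(merge_state_dicts(sd_pt, sd_ft, lam=0.7))
+ ```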
117
+ ---
118
+
119
+ ## Pretrained Weights
120
+
121
+ We provide pretrained weights for the following models:
122
+ 1. **Generalist CLIP**: A pretrained model for general vision-language tasks.
123
+ 2. **Expert CLIPs**: Four fine-tuned models for the following medical imaging domains:
124
+ - Breast Imaging
125
+ - Fundoscopy
126
+ - Cell Microscopy
127
+ - Retinal OCT
128
+
129
+ If you would like access to these weights, please contact us directly at [Raza Imam](mailto:[email protected]).
130
+
131
+ ---
132
+
133
+ ## License
134
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
135
+
136
+ ## Contact
137
+ For questions or collaborations, contact [Raza Imam](mailto:[email protected]).
baselines.py ADDED
@@ -0,0 +1,51 @@
1
+ import torch
2
+ import copy
3
+ import numpy as np
4
+ from scipy.stats import pearsonr
5
+ from t_cube import evaluate_model
6
+
7
+ def evaluate_slerp(clip_pt, sd_pt, sd_ft, dataloader, args, alpha=0.5):
8
+ """
9
+ SLERP (spherical linear interpolation) between pretrained (pt) and fine-tuned (ft) weights.
10
+ alpha=0 -> pt only; alpha=1 -> ft only.
11
+ """
12
+ model = copy.deepcopy(clip_pt)
13
+ merged_sd = {}
14
+ # flatten-per-key SLERP
15
+ for k in sd_pt.keys():
16
+ w1 = sd_pt[k].flatten().float()
17
+ w2 = sd_ft[k].flatten().float()
18
+ # cosine similarity
19
+ cos_val = torch.dot(w1, w2) / (w1.norm() * w2.norm() + 1e-8)
20
+ omega = torch.acos(torch.clamp(cos_val, -1+1e-6, 1-1e-6))
21
+ sin_omega = torch.sin(omega)
22
+ if sin_omega < 1e-6:
23
+ w_interp = (1-alpha)*w1 + alpha*w2
24
+ else:
25
+ w_interp = (torch.sin((1-alpha)*omega)/sin_omega)*w1 + \
26
+ (torch.sin(alpha*omega)/sin_omega)*w2
27
+ merged_sd[k] = w_interp.view_as(sd_pt[k])
28
+ model.load_state_dict(merged_sd)
29
+ return evaluate_model(model, dataloader, args)
30
+
31
+
32
+ def evaluate_m3(clip_pt, sd_pt, sd_ft, dataloader, args):
33
+ """
34
+ M^3 (Mixup Model Merge): sample lambda ~ Uniform(0,1) and do linear interpolation.
35
+ """
36
+ model = copy.deepcopy(clip_pt)
37
+ lam = np.random.rand()
38
+ merged_sd = {k: lam * sd_ft[k] + (1 - lam) * sd_pt[k]
39
+ for k in sd_pt.keys()}
40
+ model.load_state_dict(merged_sd)
41
+ return evaluate_model(model, dataloader, args)
42
+
43
+
44
+ def evaluate_task_arithmetic(clip_pt, sd_pt, sd_ft, dataloader, args):
45
+ """
46
+ Task Arithmetic: extrapolate along the ft−pt vector, i.e. 2*ft – pt.
47
+ """
48
+ model = copy.deepcopy(clip_pt)
49
+ merged_sd = {k: 2 * sd_ft[k] - sd_pt[k] for k in sd_pt.keys()}
50
+ model.load_state_dict(merged_sd)
51
+ return evaluate_model(model, dataloader, args)
clip/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .clip import *
2
+ from .custom_clip import *
clip/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (190 Bytes). View file
 
clip/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (202 Bytes). View file
 
clip/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (189 Bytes). View file
 
clip/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (188 Bytes). View file
 
clip/__pycache__/clip.cpython-310.pyc ADDED
Binary file (8.43 kB). View file
 
clip/__pycache__/clip.cpython-312.pyc ADDED
Binary file (13.6 kB). View file
 
clip/__pycache__/clip.cpython-38.pyc ADDED
Binary file (8.34 kB). View file
 
clip/__pycache__/clip.cpython-39.pyc ADDED
Binary file (8.4 kB). View file
 
clip/__pycache__/cocoop.cpython-310.pyc ADDED
Binary file (7.4 kB). View file
 
clip/__pycache__/cocoop.cpython-312.pyc ADDED
Binary file (13 kB). View file
 
clip/__pycache__/cocoop.cpython-39.pyc ADDED
Binary file (7.44 kB). View file
 
clip/__pycache__/custom_clip.cpython-310.pyc ADDED
Binary file (11.1 kB). View file
 
clip/__pycache__/custom_clip.cpython-312.pyc ADDED
Binary file (19.5 kB). View file
 
clip/__pycache__/custom_clip.cpython-39.pyc ADDED
Binary file (10.6 kB). View file
 
clip/__pycache__/custom_medclip.cpython-310.pyc ADDED
Binary file (10.2 kB). View file
 
clip/__pycache__/custom_medclip.cpython-312.pyc ADDED
Binary file (18.4 kB). View file
 
clip/__pycache__/custom_medclip.cpython-39.pyc ADDED
Binary file (10 kB). View file
 
clip/__pycache__/model.cpython-310.pyc ADDED
Binary file (15.2 kB). View file
 
clip/__pycache__/model.cpython-312.pyc ADDED
Binary file (29.8 kB). View file
 
clip/__pycache__/model.cpython-38.pyc ADDED
Binary file (15 kB). View file
 
clip/__pycache__/model.cpython-39.pyc ADDED
Binary file (15 kB). View file
 
clip/__pycache__/simple_tokenizer.cpython-310.pyc ADDED
Binary file (5.7 kB). View file
 
clip/__pycache__/simple_tokenizer.cpython-312.pyc ADDED
Binary file (8.92 kB). View file
 
clip/__pycache__/simple_tokenizer.cpython-38.pyc ADDED
Binary file (5.79 kB). View file
 
clip/__pycache__/simple_tokenizer.cpython-39.pyc ADDED
Binary file (5.75 kB). View file
 
clip/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
3
+ size 1356917
clip/clip.py ADDED
@@ -0,0 +1,232 @@
1
+ import hashlib
2
+ import os
3
+ import urllib
4
+ import warnings
5
+ from typing import Any, Union, List
6
+ from pkg_resources import packaging
7
+
8
+ import torch
9
+ from PIL import Image
10
+ from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
11
+ from tqdm import tqdm
12
+
13
+ from .model import build_model
14
+ from .simple_tokenizer import SimpleTokenizer as _Tokenizer
15
+
16
+ try:
17
+ from torchvision.transforms import InterpolationMode
18
+ BICUBIC = InterpolationMode.BICUBIC
19
+ except ImportError:
20
+ BICUBIC = Image.BICUBIC
21
+
22
+
23
+ if packaging.version.parse(torch.__version__) < packaging.version.parse("1.7.1"):
24
+ warnings.warn("PyTorch version 1.7.1 or higher is recommended")
25
+
26
+
27
+ __all__ = ["available_models", "load", "tokenize"]
28
+ _tokenizer = _Tokenizer()
29
+
30
+ _MODELS = {
31
+ "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
32
+ "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
33
+ "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
34
+ "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
35
+ "RN50x64": "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt",
36
+ "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
37
+ "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
38
+ "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
39
+ }
40
+
41
+
42
+ def _download(url: str, root: str):
43
+ os.makedirs(root, exist_ok=True)
44
+ filename = os.path.basename(url)
45
+
46
+ expected_sha256 = url.split("/")[-2]
47
+ download_target = os.path.join(root, filename)
48
+
49
+ if os.path.exists(download_target) and not os.path.isfile(download_target):
50
+ raise RuntimeError(f"{download_target} exists and is not a regular file")
51
+
52
+ if os.path.isfile(download_target):
53
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
54
+ return download_target
55
+ else:
56
+ warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")
57
+
58
+ with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
59
+ with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
60
+ while True:
61
+ buffer = source.read(8192)
62
+ if not buffer:
63
+ break
64
+
65
+ output.write(buffer)
66
+ loop.update(len(buffer))
67
+
68
+ if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
69
+ raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")
70
+
71
+ return download_target
72
+
73
+
74
+ def _convert_image_to_rgb(image):
75
+ return image.convert("RGB")
76
+
77
+
78
+ def _transform(n_px):
79
+ return Compose([
80
+ Resize(n_px, interpolation=BICUBIC),
81
+ CenterCrop(n_px),
82
+ _convert_image_to_rgb,
83
+ ToTensor(),
84
+ Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
85
+ ])
86
+
87
+
88
+ def available_models() -> List[str]:
89
+ """Returns the names of available CLIP models"""
90
+ return list(_MODELS.keys())
91
+
92
+
93
+ def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit: bool = False, download_root: str = None):
94
+ """Load a CLIP model
95
+
96
+ Parameters
97
+ ----------
98
+ name : str
99
+ A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict
100
+
101
+ device : Union[str, torch.device]
102
+ The device to put the loaded model
103
+
104
+ jit : bool
105
+ Whether to load the optimized JIT model or more hackable non-JIT model (default).
106
+
107
+ download_root: str
108
+ path to download the model files; by default, it uses "~/.cache/clip"
109
+
110
+ Returns
111
+ -------
112
+ model : torch.nn.Module
113
+ The CLIP model
114
+
115
+ preprocess : Callable[[PIL.Image], torch.Tensor]
116
+ A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
117
+ """
118
+ if name in _MODELS:
119
+ model_path = _download(_MODELS[name], download_root or os.path.expanduser("~/.cache/clip"))
120
+ elif os.path.isfile(name):
121
+ model_path = name
122
+ else:
123
+ raise RuntimeError(f"Model {name} not found; available models = {available_models()}")
124
+
125
+ try:
126
+ # loading JIT archive
127
+ model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
128
+ state_dict = None
129
+ except RuntimeError:
130
+ # loading saved state dict
131
+ if jit:
132
+ warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
133
+ jit = False
134
+ state_dict = torch.load(model_path, map_location="cpu")
135
+
136
+ embed_dim = model.state_dict()["text_projection"].shape[1]
137
+ if not jit:
138
+ model = build_model(state_dict or model.state_dict()).to(device)
139
+ if str(device) == "cpu":
140
+ model.float()
141
+ return model, embed_dim, _transform(model.visual.input_resolution)
142
+
143
+ # patch the device names
144
+ device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
145
+ device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]
146
+
147
+ def patch_device(module):
148
+ try:
149
+ graphs = [module.graph] if hasattr(module, "graph") else []
150
+ except RuntimeError:
151
+ graphs = []
152
+
153
+ if hasattr(module, "forward1"):
154
+ graphs.append(module.forward1.graph)
155
+
156
+ for graph in graphs:
157
+ for node in graph.findAllNodes("prim::Constant"):
158
+ if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
159
+ node.copyAttributes(device_node)
160
+
161
+ model.apply(patch_device)
162
+ patch_device(model.encode_image)
163
+ patch_device(model.encode_text)
164
+
165
+ # patch dtype to float32 on CPU
166
+ if str(device) == "cpu":
167
+ float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
168
+ float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
169
+ float_node = float_input.node()
170
+
171
+ def patch_float(module):
172
+ try:
173
+ graphs = [module.graph] if hasattr(module, "graph") else []
174
+ except RuntimeError:
175
+ graphs = []
176
+
177
+ if hasattr(module, "forward1"):
178
+ graphs.append(module.forward1.graph)
179
+
180
+ for graph in graphs:
181
+ for node in graph.findAllNodes("aten::to"):
182
+ inputs = list(node.inputs())
183
+ for i in [1, 2]: # dtype can be the second or third argument to aten::to()
184
+ if inputs[i].node()["value"] == 5:
185
+ inputs[i].node().copyAttributes(float_node)
186
+
187
+ model.apply(patch_float)
188
+ patch_float(model.encode_image)
189
+ patch_float(model.encode_text)
190
+
191
+ model.float()
192
+
193
+ return model, embed_dim, _transform(model.input_resolution.item())
194
+
195
+
196
+ def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> torch.LongTensor:
197
+ """
198
+ Returns the tokenized representation of given input string(s)
199
+
200
+ Parameters
201
+ ----------
202
+ texts : Union[str, List[str]]
203
+ An input string or a list of input strings to tokenize
204
+
205
+ context_length : int
206
+ The context length to use; all CLIP models use 77 as the context length
207
+
208
+ truncate: bool
209
+ Whether to truncate the text in case its encoding is longer than the context length
210
+
211
+ Returns
212
+ -------
213
+ A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
214
+ """
215
+ if isinstance(texts, str):
216
+ texts = [texts]
217
+
218
+ sot_token = _tokenizer.encoder["<|startoftext|>"]
219
+ eot_token = _tokenizer.encoder["<|endoftext|>"]
220
+ all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
221
+ result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
222
+
223
+ for i, tokens in enumerate(all_tokens):
224
+ if len(tokens) > context_length:
225
+ if truncate:
226
+ tokens = tokens[:context_length]
227
+ tokens[-1] = eot_token
228
+ else:
229
+ raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
230
+ result[i, :len(tokens)] = torch.tensor(tokens)
231
+
232
+ return result
clip/cocoop.py ADDED
@@ -0,0 +1,234 @@
1
+ from collections import OrderedDict
2
+ from typing import Tuple
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from clip import load, tokenize
9
+ from .simple_tokenizer import SimpleTokenizer as _Tokenizer
10
+ from .custom_clip import TextEncoder
11
+ from data.imagnet_prompts import imagenet_classes
12
+ from data.cls_to_names import *
13
+ from data.fewshot_datasets import fewshot_datasets
14
+
15
+ _tokenizer = _Tokenizer()
16
+
17
+ DOWNLOAD_ROOT='~/.cache/clip'
18
+
19
+ class CoCoOpPromptLearner(nn.Module):
20
+ def __init__(self, clip_model, classnames, n_ctx=4, ctx_init="a_photo_of_a", ctx_position='end'):
21
+ super().__init__()
22
+ n_cls = len(classnames)
23
+ dtype = clip_model.dtype
24
+ self.dtype = dtype
25
+ self.device = clip_model.visual.conv1.weight.device
26
+ ctx_dim = clip_model.ln_final.weight.shape[0]
27
+ embed_dim = clip_model.text_projection.shape[1]
28
+ self.ctx_dim = ctx_dim
29
+
30
+ if ctx_init:
31
+ # use given words to initialize context vectors
32
+ print("Initializing the context with given words: [{}]".format(ctx_init))
33
+ ctx_init = ctx_init.replace("_", " ")
34
+ n_ctx = len(ctx_init.split(" "))
35
+ prompt = tokenize(ctx_init).to(self.device)
36
+ with torch.no_grad():
37
+ embedding = clip_model.token_embedding(prompt).type(dtype)
38
+ ctx_vectors = embedding[0, 1 : 1 + n_ctx, :]
39
+ prompt_prefix = ctx_init
40
+
41
+ else:
42
+ print("Random initialization: initializing a generic context")
43
+ ctx_vectors = torch.empty(n_ctx, ctx_dim, dtype=dtype)
44
+ nn.init.normal_(ctx_vectors, std=0.02)
45
+ prompt_prefix = " ".join(["X"] * n_ctx)
46
+
47
+ print(f'Initial context: "{prompt_prefix}"')
48
+ print(f"Number of context words (tokens): {n_ctx}")
49
+ self.prompt_prefix = prompt_prefix
50
+
51
+ self.ctx = nn.Parameter(ctx_vectors) # to be optimized
52
+ self.meta_net = nn.Sequential(OrderedDict([
53
+ ("linear1", nn.Linear(embed_dim, embed_dim // 16)),
54
+ ("relu", nn.ReLU(inplace=True)),
55
+ ("linear2", nn.Linear(embed_dim // 16, ctx_dim))
56
+ ]))
57
+
58
+ classnames = [name.replace("_", " ") for name in classnames]
59
+ name_lens = [len(_tokenizer.encode(name)) for name in classnames]
60
+ prompts = [prompt_prefix + " " + name + "." for name in classnames]
61
+
62
+ tokenized_prompts = torch.cat([tokenize(p) for p in prompts]).to(self.device)
63
+ with torch.no_grad():
64
+ embedding = clip_model.token_embedding(tokenized_prompts).type(dtype)
65
+
66
+ # These token vectors will be saved when in save_model(),
67
+ # but they should be ignored in load_model() as we want to use
68
+ # those computed using the current class names
69
+ self.register_buffer("token_prefix", embedding[:, :1, :]) # SOS
70
+ self.register_buffer("token_suffix", embedding[:, 1 + n_ctx :, :]) # CLS, EOS
71
+
72
+ self.ctx_init = ctx_init
73
+ self.tokenized_prompts = tokenized_prompts # torch.Tensor
74
+ self.name_lens = name_lens
75
+ self.class_token_position = ctx_position
76
+ self.n_cls = n_cls
77
+ self.n_ctx = n_ctx
78
+
79
+ def construct_prompts(self, ctx, prefix, suffix, label=None):
80
+ # dim0 is either batch_size (during training) or n_cls (during testing)
81
+ # ctx: context tokens, with shape of (dim0, n_ctx, ctx_dim)
82
+ # prefix: the sos token, with shape of (n_cls, 1, ctx_dim)
83
+ # suffix: remaining tokens, with shape of (n_cls, *, ctx_dim)
84
+
85
+ if label is not None:
86
+ prefix = prefix[label]
87
+ suffix = suffix[label]
88
+
89
+ prompts = torch.cat(
90
+ [
91
+ prefix, # (dim0, 1, dim)
92
+ ctx, # (dim0, n_ctx, dim)
93
+ suffix, # (dim0, *, dim)
94
+ ],
95
+ dim=1,
96
+ )
97
+
98
+ return prompts
99
+
100
+ def reset_classnames(self, classnames, arch):
101
+ self.n_cls = len(classnames)
102
+ classnames = [name.replace("_", " ") for name in classnames]
103
+ name_lens = [len(_tokenizer.encode(name)) for name in classnames]
104
+ prompts = [self.prompt_prefix + " " + name + "." for name in classnames]
105
+ tokenized_prompts = torch.cat([tokenize(p) for p in prompts]).to(self.device)
106
+
107
+ clip, _, _ = load(arch, device=self.device, download_root=DOWNLOAD_ROOT)
108
+
109
+ with torch.no_grad():
110
+ embedding = clip.token_embedding(tokenized_prompts).type(self.dtype)
111
+
112
+ self.token_prefix = embedding[:, :1, :]
113
+ self.token_suffix = embedding[:, 1 + self.n_ctx :, :] # CLS, EOS
114
+
115
+ self.name_lens = name_lens
116
+ self.tokenized_prompts = tokenized_prompts
117
+
118
+ def forward(self, im_features, ctx_only=False):
119
+ prefix = self.token_prefix
120
+ suffix = self.token_suffix
121
+ ctx = self.ctx # (n_ctx, ctx_dim)
122
+ bias = self.meta_net(im_features) # (batch, ctx_dim)
123
+ bias = bias.unsqueeze(1) # (batch, 1, ctx_dim)
124
+ ctx = ctx.unsqueeze(0) # (1, n_ctx, ctx_dim)
125
+ ctx_shifted = ctx + bias # (batch, n_ctx, ctx_dim)
126
+ if ctx_only:
127
+ return ctx_shifted # don't expand to n_cls, optimize one ctx for all classes
128
+
129
+ # Use instance-conditioned context tokens for all classes
130
+ prompts = []
131
+ for ctx_shifted_i in ctx_shifted:
132
+ ctx_i = ctx_shifted_i.unsqueeze(0).expand(self.n_cls, -1, -1)
133
+ pts_i = self.construct_prompts(ctx_i, prefix, suffix) # (n_cls, n_tkn, ctx_dim)
134
+ prompts.append(pts_i)
135
+ prompts = torch.stack(prompts)
136
+
137
+ return prompts
138
+
139
+ class CoCoOpCLIP(nn.Module):
140
+ def __init__(self, device, classnames, criterion='cosine', arch="ViT-L/14",
141
+ n_ctx=16, ctx_init="a_photo_of_a", ctx_position='end'):
142
+ super().__init__()
143
+ clip, _, _ = load(arch, device=device, download_root=DOWNLOAD_ROOT)
144
+ self.image_encoder = clip.visual
145
+ self.text_encoder = TextEncoder(clip)
146
+ self.logit_scale = clip.logit_scale.data
147
+ # prompt tuning
148
+ self.prompt_generator = CoCoOpPromptLearner(clip, classnames, n_ctx, ctx_init, ctx_position)
149
+ self.tokenized_prompts = self.prompt_generator.tokenized_prompts
150
+ self.criterion = criterion
151
+ self.dtype = clip.dtype
152
+
153
+ def inference(self, image, label=None):
154
+ tokenized_prompts = self.prompt_generator.tokenized_prompts
155
+ logit_scale = self.logit_scale.exp()
156
+
157
+ image_features = self.image_encoder(image.type(self.dtype))
158
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
159
+
160
+ prompts = self.prompt_generator(image_features)
161
+
162
+ logits = []
163
+ for pts_i, imf_i in zip(prompts, image_features):
164
+ text_features = self.text_encoder(pts_i, tokenized_prompts)
165
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
166
+ l_i = logit_scale * imf_i @ text_features.t()
167
+ logits.append(l_i)
168
+ logits = torch.stack(logits)
169
+
170
+ return logits
171
+
172
+ def gen_ctx(self, image, aug=False):
173
+ with torch.no_grad():
174
+ with torch.cuda.amp.autocast():
175
+ image_features = self.image_encoder(image.type(self.dtype))
176
+ if aug:
177
+ image_feature_avg = image_features[0].unsqueeze(0)
178
+ else:
179
+ image_feature_avg = image_features.mean(dim=0, keepdim=True)
180
+ ctx = self.prompt_generator(image_feature_avg, ctx_only=True)
181
+
182
+ return image_features, ctx.detach().clone()
183
+
184
+ def forward_ctx(self, image_features, ctx):
185
+ N = 1
186
+
187
+ prefix = self.prompt_generator.token_prefix.expand(N, -1, -1, -1) # [N, n_cls, 1, dim]
188
+ suffix = self.prompt_generator.token_suffix.expand(N, -1, -1, -1)
189
+ # expand `ctx` n_cls times
190
+ ctx = ctx.expand(self.prompt_generator.n_cls, -1, -1, -1)
191
+ ctx = ctx.permute(1, 0, 2, 3)
192
+ # ctx = ctx.reshape(N, self.prompt_generator.n_cls, -1, self.prompt_generator.ctx_dim)
193
+
194
+ prompts = torch.cat([
195
+ prefix,
196
+ ctx,
197
+ suffix
198
+ ], dim=-2)
199
+
200
+ # full_n_ctx = prompts.size()[-2]
201
+
202
+ prompts = prompts.reshape(N * self.prompt_generator.n_cls, -1, self.prompt_generator.ctx_dim)
203
+ tokenized_prompts = self.prompt_generator.tokenized_prompts
204
+ tokenized_prompts = tokenized_prompts.repeat(N, 1)
205
+ text_features = self.text_encoder(prompts, tokenized_prompts)
206
+
207
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
208
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
209
+
210
+ text_features = text_features.reshape(N, -1, image_features.size()[-1])
211
+
212
+ logit_scale = self.logit_scale.exp()
213
+
214
+ text_features = text_features.squeeze(0)
215
+ logits = logit_scale * image_features @ text_features.t()
216
+
217
+ return logits
218
+
219
+ def forward(self, input):
220
+ if isinstance(input, Tuple):
221
+ image_features, ctx = input
222
+ return self.forward_ctx(image_features, ctx)
223
+ else:
224
+ return self.inference(input)
225
+
226
+ def get_cocoop(clip_arch, test_set, device, n_ctx):
227
+ if test_set in fewshot_datasets:
228
+ classnames = eval("{}_classes".format(test_set.lower()))
229
+ else:
230
+ classnames = imagenet_classes
231
+
232
+ model = CoCoOpCLIP(device, classnames, arch=clip_arch, n_ctx=n_ctx)
233
+
234
+ return model
clip/custom_clip.py ADDED
@@ -0,0 +1,388 @@
1
+
2
+ import math
3
+ from typing import List, Tuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from clip import load, tokenize
10
+ from .simple_tokenizer import SimpleTokenizer as _Tokenizer
11
+ from data.imagnet_prompts import imagenet_classes
12
+ from data.fewshot_datasets import fewshot_datasets
13
+ from data.cls_to_names import *
14
+ from utils.ModelStock import stock_model
15
+
16
+ _tokenizer = _Tokenizer()
17
+
18
+ DOWNLOAD_ROOT='~/.cache/clip'
19
+
20
+ class ClipImageEncoder(nn.Module):
21
+ def __init__(self, device, arch="ViT-L/14", image_resolution=224, n_class=1000):
22
+ super(ClipImageEncoder, self).__init__()
23
+ clip, embed_dim, _ = load(arch, device=device, download_root=DOWNLOAD_ROOT)
24
+ self.encoder = clip.visual
25
+ del clip.transformer
26
+ torch.cuda.empty_cache()
27
+
28
+ self.cls_head = nn.Linear(embed_dim, n_class)
29
+
30
+ @property
31
+ def dtype(self):
32
+ return self.encoder.conv1.weight.dtype
33
+
34
+ def forward(self, image):
35
+ x = self.encoder(image.type(self.dtype))
36
+ output = self.cls_head(x)
37
+ return output
38
+
39
+
40
+ class TextEncoder(nn.Module):
41
+ def __init__(self, clip_model):
42
+ super().__init__()
43
+ self.transformer = clip_model.transformer
44
+ self.positional_embedding = clip_model.positional_embedding
45
+ self.ln_final = clip_model.ln_final
46
+ self.text_projection = clip_model.text_projection
47
+ self.dtype = clip_model.dtype
48
+
49
+ def forward(self, prompts, tokenized_prompts):
50
+ x = prompts + self.positional_embedding.type(self.dtype)
51
+ x = x.permute(1, 0, 2) # NLD -> LND
52
+ x = self.transformer(x)
53
+ x = x.permute(1, 0, 2) # LND -> NLD
54
+ x = self.ln_final(x).type(self.dtype)
55
+
56
+ # x.shape = [batch_size, n_ctx, transformer.width]
57
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
58
+ x = x[torch.arange(x.shape[0]), tokenized_prompts.argmax(dim=-1)] @ self.text_projection
59
+
60
+ return x
61
+
62
+
63
+ class PromptLearner(nn.Module):
64
+ def __init__(self, clip_model, classnames, batch_size=None, n_ctx=16, ctx_init=None, ctx_position='end', learned_cls=False):
65
+ super().__init__()
66
+ n_cls = len(classnames)
67
+ self.learned_cls = learned_cls
68
+ dtype = clip_model.dtype
69
+ self.dtype = dtype
70
+ self.device = clip_model.visual.conv1.weight.device
71
+ ctx_dim = clip_model.ln_final.weight.shape[0]
72
+ self.ctx_dim = ctx_dim
73
+ self.batch_size = batch_size
74
+
75
+ # self.ctx, prompt_prefix = self.reset_prompt(ctx_dim, ctx_init, clip_model)
76
+
77
+ if ctx_init:
78
+ # use given words to initialize context vectors
79
+ print("Initializing the context with given words: [{}]".format(ctx_init))
80
+ ctx_init = ctx_init.replace("_", " ")
81
+ if '[CLS]' in ctx_init:
82
+ ctx_list = ctx_init.split(" ")
83
+ split_idx = ctx_list.index("[CLS]")
84
+ ctx_init = ctx_init.replace("[CLS] ", "")
85
+ ctx_position = "middle"
86
+ else:
87
+ split_idx = None
88
+ self.split_idx = split_idx
89
+ n_ctx = len(ctx_init.split(" "))
90
+ prompt = tokenize(ctx_init).to(self.device)
91
+ with torch.no_grad():
92
+ embedding = clip_model.token_embedding(prompt).type(dtype)
93
+ ctx_vectors = embedding[0, 1 : 1 + n_ctx, :]
94
+ prompt_prefix = ctx_init
95
+ else:
96
+ print("Random initialization: initializing a generic context")
97
+ ctx_vectors = torch.empty(n_ctx, ctx_dim, dtype=dtype)
98
+ nn.init.normal_(ctx_vectors, std=0.02)
99
+ prompt_prefix = " ".join(["X"] * n_ctx)
100
+
101
+ self.prompt_prefix = prompt_prefix
102
+
103
+ print(f'Initial context: "{prompt_prefix}"')
104
+ print(f"Number of context words (tokens): {n_ctx}")
105
+
106
+ # batch-wise prompt tuning for test-time adaptation
107
+ if self.batch_size is not None:
108
+ ctx_vectors = ctx_vectors.repeat(batch_size, 1, 1) #(N, L, D)
109
+ self.ctx_init_state = ctx_vectors.detach().clone()
110
+ self.ctx = nn.Parameter(ctx_vectors) # to be optimized
111
+
112
+ if not self.learned_cls:
113
+ classnames = [name.replace("_", " ") for name in classnames]
114
+ name_lens = [len(_tokenizer.encode(name)) for name in classnames]
115
+ prompts = [prompt_prefix + " " + name + "." for name in classnames]
116
+ else:
117
+ print("Random initialization: initializing a learnable class token")
118
+ cls_vectors = torch.empty(n_cls, 1, ctx_dim, dtype=dtype) # assume each learnable cls_token is only 1 word
119
+ nn.init.normal_(cls_vectors, std=0.02)
120
+ cls_token = "X"
121
+ name_lens = [1 for _ in classnames]
122
+ prompts = [prompt_prefix + " " + cls_token + "." for _ in classnames]
123
+
124
+ self.cls_init_state = cls_vectors.detach().clone()
125
+ self.cls = nn.Parameter(cls_vectors) # to be optimized
126
+
127
+ tokenized_prompts = torch.cat([tokenize(p) for p in prompts]).to(self.device)
128
+ with torch.no_grad():
129
+ embedding = clip_model.token_embedding(tokenized_prompts).type(dtype)
130
+
131
+ # These token vectors will be saved when in save_model(),
132
+ # but they should be ignored in load_model() as we want to use
133
+ # those computed using the current class names
134
+ self.register_buffer("token_prefix", embedding[:, :1, :]) # SOS
135
+ if self.learned_cls:
136
+ self.register_buffer("token_suffix", embedding[:, 1 + n_ctx + 1:, :]) # ..., EOS
137
+ else:
138
+ self.register_buffer("token_suffix", embedding[:, 1 + n_ctx :, :]) # CLS, EOS
139
+
140
+ self.ctx_init = ctx_init
141
+ self.tokenized_prompts = tokenized_prompts # torch.Tensor
142
+ self.name_lens = name_lens
143
+ self.class_token_position = ctx_position
144
+ self.n_cls = n_cls
145
+ self.n_ctx = n_ctx
146
+ self.classnames = classnames
147
+
148
+ def reset(self):
149
+ ctx_vectors = self.ctx_init_state
150
+ self.ctx.copy_(ctx_vectors) # to be optimized
151
+ if self.learned_cls:
152
+ cls_vectors = self.cls_init_state
153
+ self.cls.copy_(cls_vectors)
154
+
155
+ def reset_classnames(self, classnames, arch):
156
+ self.n_cls = len(classnames)
157
+ if not self.learned_cls:
158
+ classnames = [name.replace("_", " ") for name in classnames]
159
+ name_lens = [len(_tokenizer.encode(name)) for name in classnames]
160
+ prompts = [self.prompt_prefix + " " + name + "." for name in classnames]
161
+ else:
162
+ cls_vectors = torch.empty(self.n_cls, 1, self.ctx_dim, dtype=self.dtype) # assume each learnable cls_token is only 1 word
163
+ nn.init.normal_(cls_vectors, std=0.02)
164
+ cls_token = "X"
165
+ name_lens = [1 for _ in classnames]
166
+ prompts = [self.prompt_prefix + " " + cls_token + "." for _ in classnames]
167
+ # TODO: re-init the cls parameters
168
+ # self.cls = nn.Parameter(cls_vectors) # to be optimized
169
+ self.cls_init_state = cls_vectors.detach().clone()
170
+ tokenized_prompts = torch.cat([tokenize(p) for p in prompts]).to(self.device)
171
+
172
+ clip, _, _ = load(arch, device=self.device, download_root=DOWNLOAD_ROOT)
173
+
174
+ with torch.no_grad():
175
+ embedding = clip.token_embedding(tokenized_prompts).type(self.dtype)
176
+
177
+ self.token_prefix = embedding[:, :1, :]
178
+ self.token_suffix = embedding[:, 1 + self.n_ctx :, :] # CLS, EOS
179
+
180
+ self.name_lens = name_lens
181
+ self.tokenized_prompts = tokenized_prompts
182
+ self.classnames = classnames
183
+
184
+ def forward(self, init=None):
185
+ # the init will be used when computing CLIP directional loss
186
+ if init is not None:
187
+ ctx = init
188
+ else:
189
+ ctx = self.ctx
190
+ if ctx.dim() == 2:
191
+ ctx = ctx.unsqueeze(0).expand(self.n_cls, -1, -1)
192
+ elif not ctx.size()[0] == self.n_cls:
193
+ ctx = ctx.unsqueeze(1).expand(-1, self.n_cls, -1, -1)
194
+
195
+ prefix = self.token_prefix
196
+ suffix = self.token_suffix
197
+ if self.batch_size is not None:
198
+ # This way only works for single-gpu setting (could pass batch size as an argument for forward())
199
+ prefix = prefix.repeat(self.batch_size, 1, 1, 1)
200
+ suffix = suffix.repeat(self.batch_size, 1, 1, 1)
201
+
202
+ if self.learned_cls:
203
+ assert self.class_token_position == "end"
204
+ if self.class_token_position == "end":
205
+ if self.learned_cls:
206
+ cls = self.cls
207
+ prompts = torch.cat(
208
+ [
209
+ prefix, # (n_cls, 1, dim)
210
+ ctx, # (n_cls, n_ctx, dim)
211
+ cls, # (n_cls, 1, dim)
212
+ suffix, # (n_cls, *, dim)
213
+ ],
214
+ dim=-2,
215
+ )
216
+ else:
217
+ prompts = torch.cat(
218
+ [
219
+ prefix, # (n_cls, 1, dim)
220
+ ctx, # (n_cls, n_ctx, dim)
221
+ suffix, # (n_cls, *, dim)
222
+ ],
223
+ dim=-2,
224
+ )
225
+ elif self.class_token_position == "middle":
226
+ # TODO: to work with a batch of prompts
227
+ if self.split_idx is not None:
228
+ half_n_ctx = self.split_idx # split the ctx at the position of [CLS] in `ctx_init`
229
+ else:
230
+ half_n_ctx = self.n_ctx // 2
231
+ prompts = []
232
+ for i in range(self.n_cls):
233
+ name_len = self.name_lens[i]
234
+ prefix_i = prefix[i : i + 1, :, :]
235
+ class_i = suffix[i : i + 1, :name_len, :]
236
+ suffix_i = suffix[i : i + 1, name_len:, :]
237
+ ctx_i_half1 = ctx[i : i + 1, :half_n_ctx, :]
238
+ ctx_i_half2 = ctx[i : i + 1, half_n_ctx:, :]
239
+ prompt = torch.cat(
240
+ [
241
+ prefix_i, # (1, 1, dim)
242
+ ctx_i_half1, # (1, n_ctx//2, dim)
243
+ class_i, # (1, name_len, dim)
244
+ ctx_i_half2, # (1, n_ctx//2, dim)
245
+ suffix_i, # (1, *, dim)
246
+ ],
247
+ dim=1,
248
+ )
249
+ prompts.append(prompt)
250
+ prompts = torch.cat(prompts, dim=0)
251
+
252
+ elif self.class_token_position == "front":
253
+ prompts = []
254
+ for i in range(self.n_cls):
255
+ name_len = self.name_lens[i]
256
+ prefix_i = prefix[i : i + 1, :, :]
257
+ class_i = suffix[i : i + 1, :name_len, :]
258
+ suffix_i = suffix[i : i + 1, name_len:, :]
259
+ ctx_i = ctx[i : i + 1, :, :]
260
+ prompt = torch.cat(
261
+ [
262
+ prefix_i, # (1, 1, dim)
263
+ class_i, # (1, name_len, dim)
264
+ ctx_i, # (1, n_ctx, dim)
265
+ suffix_i, # (1, *, dim)
266
+ ],
267
+ dim=1,
268
+ )
269
+ prompts.append(prompt)
270
+ prompts = torch.cat(prompts, dim=0)
271
+
272
+ else:
273
+ raise ValueError
274
+
275
+ return prompts
276
+
277
+
278
+ class ClipTestTimeTuning(nn.Module):
279
+ def __init__(self, device, classnames, batch_size, criterion='cosine', arch="ViT-L/14",
280
+ n_ctx=16, ctx_init=None, ctx_position='end', learned_cls=False, pubmedclip_path=None,
281
+ merge=False, state_dict=None):
282
+ super(ClipTestTimeTuning, self).__init__()
283
+ clip, _, _ = load(arch, device=device, download_root=DOWNLOAD_ROOT)
284
+ if pubmedclip_path is not None:
285
+ ft_dict = torch.load(pubmedclip_path, map_location=f'cuda:{device}')
286
+ if merge:
287
+ print("Merging the weights of clip and state dict using WiSE-FT approach")
288
+ # WiSE-FT approach
289
+ merged_dict = {}
290
+ alpha = 0.50 # You can adjust this value as needed
291
+ for key in clip.state_dict().keys():
292
+ merged_dict[key] = alpha * ft_dict[key] + (1 - alpha) * clip.state_dict()[key] # clip.load_state_dict(state_dict)
293
+ # Model Stock
294
+ # state_dict = stock_model(state_dict, clip.state_dict())
295
+ else:
296
+ merged_dict = ft_dict
297
+ clip.load_state_dict(merged_dict)
298
+ if state_dict is not None:
299
+ clip.load_state_dict(state_dict)
300
+ self.visual = clip.visual
301
+ self.text_encoder = TextEncoder(clip)
302
+ self.logit_scale = clip.logit_scale.data
303
+ # prompt tuning
304
+ self.prompt_learner = PromptLearner(clip, classnames, batch_size, n_ctx, ctx_init, ctx_position, learned_cls)
305
+ self.criterion = criterion
306
+ self.l2_norm_cal = False
307
+
308
+ @property
309
+ def dtype(self):
310
+ return self.visual.conv1.weight.dtype
311
+
312
+ # restore the initial state of the prompt_learner (tunable prompt)
313
+ def reset(self):
314
+ self.prompt_learner.reset()
315
+
316
+ def reset_classnames(self, classnames, arch):
317
+ self.prompt_learner.reset_classnames(classnames, arch)
318
+
319
+ def get_text_features(self, normalize=True):
320
+ text_features = []
321
+ prompts = self.prompt_learner()
322
+ tokenized_prompts = self.prompt_learner.tokenized_prompts
323
+ t_features = self.text_encoder(prompts, tokenized_prompts)
324
+ if normalize:
325
+ t_features = t_features / t_features.norm(dim=-1, keepdim=True)
326
+ text_features.append(t_features)
327
+ text_features = torch.stack(text_features, dim=0)
328
+
329
+ return torch.mean(text_features, dim=0)
330
+
331
+ def inference(self, image, return_logits=False, normalize=True):
332
+ with torch.no_grad():
333
+ image_features = self.visual(image.type(self.dtype))
334
+ # with torch.no_grad():
335
+ text_features = self.get_text_features(normalize=normalize)
336
+ if normalize:
337
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
338
+
339
+ #[c-tpt] --------------------------------------------
340
+ if self.l2_norm_cal:
341
+ prompt_mean = text_features.mean(0)
342
+ feature_distance = text_features - prompt_mean
343
+ l2_norm = torch.linalg.norm(feature_distance, dim=-1)
344
+ l2_norm_mean = l2_norm.mean()
345
+
346
+ #for saving to csv file
347
+ self.l2_norm_mean = l2_norm_mean.item()
348
+
349
+ #for training
350
+ self.l2_norm_mean_training = l2_norm_mean
351
+
352
+ #-----------------------------------------------------
353
+
354
+ logit_scale = self.logit_scale.exp()
355
+ logits = logit_scale * image_features @ text_features.t()
356
+
357
+ if return_logits:
358
+ return logits, image_features, text_features
359
+
360
+ return logits
361
+
362
+ def forward(self, input, return_logits=False, normalize=True):
363
+ if isinstance(input, Tuple):
364
+ view_0, view_1, view_2 = input
365
+ return self.contrast_prompt_tuning(view_0, view_1, view_2)
366
+ elif len(input.size()) == 2:
367
+ return self.directional_prompt_tuning(input)
368
+ else:
369
+ return self.inference(input, return_logits, normalize)
370
+
371
+
372
+ def get_coop(clip_arch, test_set, device, n_ctx, ctx_init, classnames, learned_cls=False, pubmedclip_path=None, merge=False, state_dict=None):
373
+ # if test_set in fewshot_datasets:
374
+ # classnames = eval("{}_classes".format(test_set.lower()))
375
+ # elif test_set == 'bongard':
376
+ # if learned_cls:
377
+ # classnames = ['X', 'X']
378
+ # else:
379
+ # classnames = ['True', 'False']
380
+ # else:
381
+ # classnames = imagenet_classes
382
+
383
+ model = ClipTestTimeTuning(device, classnames, None, arch=clip_arch,
384
+ n_ctx=n_ctx, ctx_init=ctx_init, learned_cls=learned_cls, pubmedclip_path=pubmedclip_path, merge=merge,
385
+ state_dict=state_dict)
386
+
387
+ return model
388
+
clip/custom_medclip.py ADDED
@@ -0,0 +1,389 @@
1
+
2
+ import math
3
+ from typing import List, Tuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from torchvision.models import resnet50, ResNet
9
+
10
+ from .clip import load, tokenize
11
+ from .simple_tokenizer import SimpleTokenizer as _Tokenizer
12
+ from data.imagnet_prompts import imagenet_classes
13
+ from data.fewshot_datasets import fewshot_datasets
14
+ from data.cls_to_names import *
15
+ # from data.medclip_datasets_clsnames import *
16
+ import os
17
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
18
+ _tokenizer = _Tokenizer()
19
+
20
+ DOWNLOAD_ROOT='~/.cache/clip'
21
+
22
+ # class ClipImageEncoder(nn.Module):
23
+ # def __init__(self, device, arch="ViT-L/14", image_resolution=224, n_class=1000):
24
+ # super(ClipImageEncoder, self).__init__()
25
+ # clip, embed_dim, _ = load(arch, device=device, download_root=DOWNLOAD_ROOT)
26
+ # self.encoder = clip.visual
27
+ # del clip.transformer
28
+ # torch.cuda.empty_cache()
29
+
30
+ # self.cls_head = nn.Linear(embed_dim, n_class)
31
+
32
+ # @property
33
+ # def dtype(self):
34
+ # return self.encoder.conv1.weight.dtype
35
+
36
+ # def forward(self, image):
37
+ # x = self.encoder(image.type(self.dtype))
38
+ # output = self.cls_head(x)
39
+ # return output
40
+
41
+
42
+ class TextEncoder(nn.Module):
43
+ def __init__(self, medclip_text_model):
44
+ super().__init__()
45
+ self.medclip_text_model = medclip_text_model
46
+
47
+ def forward(self, prompts_embeddings, tokenized_prompts):
48
+
49
+ output = self.medclip_text_model.model(inputs_embeds=prompts_embeddings, attention_mask=tokenized_prompts['attention_mask'])
50
+
51
+ # take the average of last four layers
52
+ # last_hidden_states = torch.stack(output['hidden_states'][-self.last_n_layer:]) # n_layer, batch, seqlen, emb_dim
53
+ # embed = last_hidden_states.permute(1,0,2,3)
54
+ # embed = embed.mean(1).mean(1) # pooling
55
+
56
+ # get 1+2+last layer
57
+ last_hidden_states = torch.stack([output['hidden_states'][1], output['hidden_states'][2], output['hidden_states'][-1]]) # n_layer, batch, seqlen, emb_dim
58
+ embed = last_hidden_states.permute(1,0,2,3).mean(2).mean(1) # pooling
59
+
60
+ # let's take only the last hidden layer
61
+ # embed = output['pooler_output']
62
+
63
+ embed = self.medclip_text_model.projection_head(embed)
64
+ return embed
65
+
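
Note: a standalone sketch of the hidden-state pooling that `TextEncoder.forward` performs above (averaging the embeddings from BERT layers 1, 2 and the last layer over the sequence and layer dimensions before the projection head). The tensor names are assumptions for illustration, and the text model must be run with `output_hidden_states=True` for `hidden_states` to be available.

import torch

def pool_hidden_states(hidden_states):
    # hidden_states: tuple of [batch, seqlen, 768] tensors, one per encoder layer
    picked = torch.stack([hidden_states[1], hidden_states[2], hidden_states[-1]])  # [3, batch, seqlen, 768]
    embed = picked.permute(1, 0, 2, 3)   # [batch, 3, seqlen, 768]
    embed = embed.mean(2).mean(1)        # mean over sequence, then over the three layers -> [batch, 768]
    return embed
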
66
+
67
+ class PromptLearner(nn.Module):
68
+ def __init__(self, medclip_model, classnames, device, batch_size=None, n_ctx=16, ctx_init=None, ctx_position='end', learned_cls=False):
69
+ super().__init__()
70
+ n_cls = len(classnames)
71
+ self.learned_cls = learned_cls
72
+ dtype = medclip_model.dtype
73
+ self.dtype = dtype
74
+ ctx_dim = 768 # hardcoded for now!!! medclip_model.ln_final.weight.shape[0]
75
+ self.ctx_dim = ctx_dim
76
+ self.batch_size = batch_size
77
+ self.device = device
78
+ self.medclip_model = medclip_model
79
+
80
+ # self.ctx, prompt_prefix = self.reset_prompt(ctx_dim, ctx_init, medclip_model)
81
+
82
+ if ctx_init:
83
+ # raise NotImplementedError("This part is not yet implemented.")
84
+ # use given words to initialize context vectors
85
+ print("Initializing the context with given words: [{}]".format(ctx_init))
86
+ # breakpoint()
87
+ ctx_init = ctx_init.replace("_", " ")
88
+ if '[CLS]' in ctx_init:
89
+ ctx_list = ctx_init.split(" ")
90
+ split_idx = ctx_list.index("[CLS]")
91
+ ctx_init = ctx_init.replace("[CLS] ", "")
92
+ ctx_position = "middle"
93
+ else:
94
+ split_idx = None
95
+ self.split_idx = split_idx
96
+ n_ctx = len(ctx_init.split(" "))
97
+
98
+ # prompt = tokenize(ctx_init).to(self.device)
99
+ prompt = ctx_init
100
+ tokenized_prompts = medclip_model.text_model.tokenizer(prompt, padding='max_length', max_length=25, truncation=True, return_tensors='pt').to(self.device)
101
+ prompts_tokens = tokenized_prompts['input_ids'] # [1, 25]
102
+ with torch.no_grad():
103
+ embedding = medclip_model.text_model.model.embeddings.word_embeddings(prompts_tokens).type(dtype) # [1, 25, 768]
104
+ # embedding = medclip_model.token_embedding(prompt).type(dtype)
105
+ ctx_vectors = embedding[0, 1 : 1 + n_ctx, :]
106
+ prompt_prefix = ctx_init
107
+ else:
108
+ print("Random initialization: initializing a generic context")
109
+ ctx_vectors = torch.empty(n_ctx, ctx_dim, dtype=dtype)
110
+ nn.init.normal_(ctx_vectors, std=0.02)
111
+ prompt_prefix = " ".join(["X"] * n_ctx)
112
+
113
+ self.prompt_prefix = prompt_prefix
114
+
115
+ print(f'Initial context: "{prompt_prefix}"')
116
+ print(f"Number of context words (tokens): {n_ctx}")
117
+
118
+ # batch-wise prompt tuning for test-time adaptation
119
+ if self.batch_size is not None:
120
+ ctx_vectors = ctx_vectors.repeat(batch_size, 1, 1) #(N, L, D)
121
+ self.ctx_init_state = ctx_vectors.detach().clone()
122
+ self.ctx = nn.Parameter(ctx_vectors) # to be optimized
123
+
124
+ if not self.learned_cls:
125
+ classnames = [name.replace("_", " ") for name in classnames]
126
+ name_lens = [len(medclip_model.text_model.tokenizer.encode(name))-2 for name in classnames] # [CLS] and [SEP] are not counted
127
+ prompts = [prompt_prefix + " " + name + "." for name in classnames]
128
+ else:
129
+ print("Random initialization: initializing a learnable class token")
130
+ cls_vectors = torch.empty(n_cls, 1, ctx_dim, dtype=dtype) # assume each learnable cls_token is only 1 word
131
+ nn.init.normal_(cls_vectors, std=0.02)
132
+ cls_token = "X"
133
+ name_lens = [1 for _ in classnames]
134
+ prompts = [prompt_prefix + " " + cls_token + "." for _ in classnames]
135
+
136
+ self.cls_init_state = cls_vectors.detach().clone()
137
+ self.cls = nn.Parameter(cls_vectors) # to be optimized
138
+
139
+ tokenized_prompts = medclip_model.text_model.tokenizer(prompts, padding='max_length', max_length=25, truncation=True, return_tensors='pt').to(self.device)
140
+ prompts_tokens = tokenized_prompts['input_ids'] # [n_cls, 25]
141
+ with torch.no_grad():
142
+ embedding = medclip_model.text_model.model.embeddings.word_embeddings(prompts_tokens).type(dtype) # [n_cls, 25, 768]
143
+
144
+ # These token vectors will be saved when in save_model(),
145
+ # but they should be ignored in load_model() as we want to use
146
+ # those computed using the current class names
147
+ self.register_buffer("token_prefix", embedding[:, :1, :]) # SOS
148
+ if self.learned_cls:
149
+ self.register_buffer("token_suffix", embedding[:, 1 + n_ctx + 1:, :]) # ..., EOS
150
+ else:
151
+ self.register_buffer("token_suffix", embedding[:, 1 + n_ctx :, :]) # CLS, EOS
152
+
153
+ self.ctx_init = ctx_init
154
+ self.tokenized_prompts = tokenized_prompts # torch.Tensor
155
+ self.name_lens = name_lens
156
+ self.class_token_position = ctx_position
157
+ self.n_cls = n_cls
158
+ self.n_ctx = n_ctx
159
+ self.classnames = classnames
160
+
161
+ def reset(self):
162
+ ctx_vectors = self.ctx_init_state
163
+ self.ctx.copy_(ctx_vectors) # to be optimized
164
+ if self.learned_cls:
165
+ cls_vectors = self.cls_init_state
166
+ self.cls.copy_(cls_vectors)
167
+
168
+ def reset_classnames(self, classnames, arch):
169
+ self.n_cls = len(classnames)
170
+ if not self.learned_cls:
171
+ classnames = [name.replace("_", " ") for name in classnames]
172
+ name_lens = [len(self.medclip_model.text_model.tokenizer.encode(name))-2 for name in classnames] # [CLS] and [SEP] are not counted
173
+ prompts = [self.prompt_prefix + " " + name + "." for name in classnames]
174
+ else:
175
+ cls_vectors = torch.empty(self.n_cls, 1, self.ctx_dim, dtype=self.dtype) # assume each learnable cls_token is only 1 word
176
+ nn.init.normal_(cls_vectors, std=0.02)
177
+ cls_token = "X"
178
+ name_lens = [1 for _ in classnames]
179
+ prompts = [self.prompt_prefix + " " + cls_token + "." for _ in classnames]
180
+
181
+ self.cls_init_state = cls_vectors.detach().clone()
182
+
183
+ tokenized_prompts = self.medclip_model.text_model.tokenizer(prompts, padding='max_length', max_length=25, truncation=True, return_tensors='pt').to(self.device)
184
+ prompts_tokens = tokenized_prompts['input_ids']
185
+
186
+ with torch.no_grad():
187
+ embedding = self.medclip_model.text_model.model.embeddings.word_embeddings(prompts_tokens).type(self.dtype) # [n_cls, 25, 768]
188
+
189
+ self.token_prefix = embedding[:, :1, :]
190
+ self.token_suffix = embedding[:, 1 + self.n_ctx :, :] # CLS, EOS
191
+
192
+ self.name_lens = name_lens
193
+ self.tokenized_prompts = tokenized_prompts
194
+ self.classnames = classnames
195
+
196
+ def forward(self, init=None):
197
+ # the init will be used when computing CLIP directional loss
198
+ if init is not None:
199
+ ctx = init
200
+ else:
201
+ ctx = self.ctx
202
+ if ctx.dim() == 2:
203
+ ctx = ctx.unsqueeze(0).expand(self.n_cls, -1, -1)
204
+ elif not ctx.size()[0] == self.n_cls:
205
+ ctx = ctx.unsqueeze(1).expand(-1, self.n_cls, -1, -1)
206
+
207
+ prefix = self.token_prefix
208
+ suffix = self.token_suffix
209
+ if self.batch_size is not None:
210
+ # This way only works for single-gpu setting (could pass batch size as an argument for forward())
211
+ prefix = prefix.repeat(self.batch_size, 1, 1, 1)
212
+ suffix = suffix.repeat(self.batch_size, 1, 1, 1)
213
+
214
+ if self.learned_cls:
215
+ assert self.class_token_position == "end"
216
+ if self.class_token_position == "end":
217
+ if self.learned_cls:
218
+ cls = self.cls
219
+ prompts = torch.cat(
220
+ [
221
+ prefix, # (n_cls, 1, dim)
222
+ ctx, # (n_cls, n_ctx, dim)
223
+ cls, # (n_cls, 1, dim)
224
+ suffix, # (n_cls, *, dim)
225
+ ],
226
+ dim=-2,
227
+ )
228
+ else:
229
+ prompts = torch.cat(
230
+ [
231
+ prefix, # (n_cls, 1, dim)
232
+ ctx, # (n_cls, n_ctx, dim)
233
+ suffix, # (n_cls, *, dim)
234
+ ],
235
+ dim=-2,
236
+ )
237
+ elif self.class_token_position == "middle":
238
+ # TODO: to work with a batch of prompts
239
+ if self.split_idx is not None:
240
+ half_n_ctx = self.split_idx # split the ctx at the position of [CLS] in `ctx_init`
241
+ else:
242
+ half_n_ctx = self.n_ctx // 2
243
+ prompts = []
244
+ for i in range(self.n_cls):
245
+ name_len = self.name_lens[i]
246
+ prefix_i = prefix[i : i + 1, :, :]
247
+ class_i = suffix[i : i + 1, :name_len, :]
248
+ suffix_i = suffix[i : i + 1, name_len:, :]
249
+ ctx_i_half1 = ctx[i : i + 1, :half_n_ctx, :]
250
+ ctx_i_half2 = ctx[i : i + 1, half_n_ctx:, :]
251
+ prompt = torch.cat(
252
+ [
253
+ prefix_i, # (1, 1, dim)
254
+ ctx_i_half1, # (1, n_ctx//2, dim)
255
+ class_i, # (1, name_len, dim)
256
+ ctx_i_half2, # (1, n_ctx//2, dim)
257
+ suffix_i, # (1, *, dim)
258
+ ],
259
+ dim=1,
260
+ )
261
+ prompts.append(prompt)
262
+ prompts = torch.cat(prompts, dim=0)
263
+
264
+ elif self.class_token_position == "front":
265
+ prompts = []
266
+ for i in range(self.n_cls):
267
+ name_len = self.name_lens[i]
268
+ prefix_i = prefix[i : i + 1, :, :]
269
+ class_i = suffix[i : i + 1, :name_len, :]
270
+ suffix_i = suffix[i : i + 1, name_len:, :]
271
+ ctx_i = ctx[i : i + 1, :, :]
272
+ prompt = torch.cat(
273
+ [
274
+ prefix_i, # (1, 1, dim)
275
+ class_i, # (1, name_len, dim)
276
+ ctx_i, # (1, n_ctx, dim)
277
+ suffix_i, # (1, *, dim)
278
+ ],
279
+ dim=1,
280
+ )
281
+ prompts.append(prompt)
282
+ prompts = torch.cat(prompts, dim=0)
283
+
284
+ else:
285
+ raise ValueError
286
+
287
+ return prompts
288
+
289
+ from MedCLIP.medclip import MedCLIPModel, MedCLIPVisionModel, MedCLIPVisionModelViT
290
+ from MedCLIP.medclip import MedCLIPProcessor
291
+
292
+ def load_medclip_to_cpu():
293
+ model = MedCLIPModel(vision_cls=MedCLIPVisionModelViT)
294
+ model.from_pretrained()  # downloads the default hub weights; overridden by the local checkpoint loaded below
295
+ # breakpoint()
296
+ # model.from_pretrained("/l/users/asif.hanif/pre-trained-models/vlps/medclip/pretrained/medclip-vit/")
297
+ model.from_pretrained("./MedCLIP/pretrained/medclip-vit/")
298
+ # for vit
299
+ model.dtype = model.vision_model.model.embeddings.patch_embeddings.projection.weight.dtype
300
+ # for Resnet
301
+ # model.dtype = model.vision_model.model.conv1.weight.dtype
302
+
303
+
304
+ model.eval()
305
+ return model
306
+
307
+ class ClipTestTimeTuning(nn.Module):
308
+ def __init__(self, device, classnames, batch_size, criterion='cosine', arch="ViT-L/14",
309
+ n_ctx=16, ctx_init=None, ctx_position='end', learned_cls=False):
310
+ super(ClipTestTimeTuning, self).__init__()
311
+ self.device = device
312
+ self.medclip_model = load_medclip_to_cpu()
313
+ self.dtype = self.medclip_model.dtype
314
+ self.medclip_model = self.medclip_model.to(self.device)
315
+ self.image_encoder = self.medclip_model.vision_model
316
+ self.text_encoder = TextEncoder(self.medclip_model.text_model)
317
+ self.logit_scale = self.medclip_model.logit_scale.data
318
+ # prompt tuning
319
+ self.prompt_learner = PromptLearner(self.medclip_model, classnames, self.device, batch_size, n_ctx, ctx_init, ctx_position, learned_cls)
320
+ self.criterion = criterion
321
+ self.l2_norm_cal = False
322
+
323
+ # @property
324
+ # def dtype(self):
325
+ # return self.image_encoder.conv1.weight.dtype
326
+
327
+ # restore the initial state of the prompt_learner (tunable prompt)
328
+ def reset(self):
329
+ self.prompt_learner.reset()
330
+
331
+ def reset_classnames(self, classnames, arch):
332
+ self.prompt_learner.reset_classnames(classnames, arch)
333
+
334
+ def get_text_features(self):
335
+ text_features = []
336
+ prompts = self.prompt_learner()
337
+ tokenized_prompts = self.prompt_learner.tokenized_prompts
338
+ t_features = self.text_encoder(prompts, tokenized_prompts)
339
+ text_features.append(t_features / t_features.norm(dim=-1, keepdim=True))
340
+ text_features = torch.stack(text_features, dim=0)
341
+
342
+ return torch.mean(text_features, dim=0)
343
+
344
+ def inference(self, image):
345
+ with torch.no_grad():
346
+ image_features = self.image_encoder(image.type(self.dtype))
347
+
348
+ text_features = self.get_text_features()
349
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
350
+
351
+ #[c-tpt] --------------------------------------------
352
+ if self.l2_norm_cal:
353
+ prompt_mean = text_features.mean(0)
354
+ feature_distance = text_features - prompt_mean
355
+ l2_norm = torch.linalg.norm(feature_distance, dim=-1)
356
+ l2_norm_mean = l2_norm.mean()
357
+
358
+ #for saving to csv file
359
+ self.l2_norm_mean = l2_norm_mean.item()
360
+
361
+ #for training
362
+ self.l2_norm_mean_training = l2_norm_mean
363
+
364
+ #-----------------------------------------------------
365
+
366
+ logit_scale = self.logit_scale.exp()
367
+ logits = logit_scale * image_features @ text_features.t()
368
+
369
+ return logits
370
+
371
+ def forward(self, input):
372
+ # breakpoint()
373
+ if isinstance(input, Tuple):
374
+ view_0, view_1, view_2 = input
375
+ return self.contrast_prompt_tuning(view_0, view_1, view_2)
376
+ elif len(input.size()) == 2:
377
+ return self.directional_prompt_tuning(input)
378
+ else:
379
+ return self.inference(input)
380
+
381
+
382
+ def get_coop(clip_arch, test_set, device, n_ctx, ctx_init=None, learned_cls=False):
383
+ classnames = eval("{}_classes".format(test_set.lower()))
384
+
385
+ model = ClipTestTimeTuning(device, classnames, None, arch=clip_arch,
386
+ n_ctx=n_ctx, ctx_init=ctx_init, learned_cls=learned_cls)
387
+
388
+ return model
389
+
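
Note: a minimal sketch of the prompt assembly done in `PromptLearner.forward` for the default `class_token_position == 'end'` case, assuming `prefix` ([n_cls, 1, dim]), `ctx` ([n_ctx, dim] learnable context) and `suffix` ([n_cls, *, dim]) are the embeddings registered above; this is an illustration, not a drop-in replacement.

import torch

def assemble_prompts(prefix: torch.Tensor, ctx: torch.Tensor, suffix: torch.Tensor) -> torch.Tensor:
    # prefix: [n_cls, 1, dim]  (start-of-sequence / [CLS] embedding)
    # ctx:    [n_ctx, dim]     (shared learnable context vectors)
    # suffix: [n_cls, *, dim]  (class-name, [SEP]/EOS and padding embeddings)
    n_cls = prefix.shape[0]
    ctx = ctx.unsqueeze(0).expand(n_cls, -1, -1)    # broadcast the shared context to every class
    return torch.cat([prefix, ctx, suffix], dim=1)  # [n_cls, 1 + n_ctx + *, dim]
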
clip/model.py ADDED
@@ -0,0 +1,438 @@
1
+ from collections import OrderedDict
2
+ from typing import Tuple, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+
9
+
10
+ class Bottleneck(nn.Module):
11
+ expansion = 4
12
+
13
+ def __init__(self, inplanes, planes, stride=1):
14
+ super().__init__()
15
+
16
+ # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
17
+ self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
18
+ self.bn1 = nn.BatchNorm2d(planes)
19
+ self.relu1 = nn.ReLU(inplace=True)
20
+
21
+ self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
22
+ self.bn2 = nn.BatchNorm2d(planes)
23
+ self.relu2 = nn.ReLU(inplace=True)
24
+
25
+ self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
26
+
27
+ self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
28
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
29
+ self.relu3 = nn.ReLU(inplace=True)
30
+
31
+ self.downsample = None
32
+ self.stride = stride
33
+
34
+ if stride > 1 or inplanes != planes * Bottleneck.expansion:
35
+ # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
36
+ self.downsample = nn.Sequential(OrderedDict([
37
+ ("-1", nn.AvgPool2d(stride)),
38
+ ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
39
+ ("1", nn.BatchNorm2d(planes * self.expansion))
40
+ ]))
41
+
42
+ def forward(self, x: torch.Tensor):
43
+ identity = x
44
+
45
+ out = self.relu1(self.bn1(self.conv1(x)))
46
+ out = self.relu2(self.bn2(self.conv2(out)))
47
+ out = self.avgpool(out)
48
+ out = self.bn3(self.conv3(out))
49
+
50
+ if self.downsample is not None:
51
+ identity = self.downsample(x)
52
+
53
+ out += identity
54
+ out = self.relu3(out)
55
+ return out
56
+
57
+
58
+ class AttentionPool2d(nn.Module):
59
+ def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
60
+ super().__init__()
61
+ self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
62
+ self.k_proj = nn.Linear(embed_dim, embed_dim)
63
+ self.q_proj = nn.Linear(embed_dim, embed_dim)
64
+ self.v_proj = nn.Linear(embed_dim, embed_dim)
65
+ self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
66
+ self.num_heads = num_heads
67
+
68
+ def forward(self, x):
69
+ x = x.flatten(start_dim=2).permute(2, 0, 1) # NCHW -> (HW)NC
70
+ x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
71
+ x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
72
+ x, _ = F.multi_head_attention_forward(
73
+ query=x[:1], key=x, value=x,
74
+ embed_dim_to_check=x.shape[-1],
75
+ num_heads=self.num_heads,
76
+ q_proj_weight=self.q_proj.weight,
77
+ k_proj_weight=self.k_proj.weight,
78
+ v_proj_weight=self.v_proj.weight,
79
+ in_proj_weight=None,
80
+ in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
81
+ bias_k=None,
82
+ bias_v=None,
83
+ add_zero_attn=False,
84
+ dropout_p=0,
85
+ out_proj_weight=self.c_proj.weight,
86
+ out_proj_bias=self.c_proj.bias,
87
+ use_separate_proj_weight=True,
88
+ training=self.training,
89
+ need_weights=False
90
+ )
91
+ return x.squeeze(0)
92
+
93
+
94
+ class ModifiedResNet(nn.Module):
95
+ """
96
+ A ResNet class that is similar to torchvision's but contains the following changes:
97
+ - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
98
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
99
+ - The final pooling layer is a QKV attention instead of an average pool
100
+ """
101
+
102
+ def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
103
+ super().__init__()
104
+ self.output_dim = output_dim
105
+ self.input_resolution = input_resolution
106
+
107
+ # the 3-layer stem
108
+ self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
109
+ self.bn1 = nn.BatchNorm2d(width // 2)
110
+ self.relu1 = nn.ReLU(inplace=True)
111
+ self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
112
+ self.bn2 = nn.BatchNorm2d(width // 2)
113
+ self.relu2 = nn.ReLU(inplace=True)
114
+ self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
115
+ self.bn3 = nn.BatchNorm2d(width)
116
+ self.relu3 = nn.ReLU(inplace=True)
117
+ self.avgpool = nn.AvgPool2d(2)
118
+
119
+ # residual layers
120
+ self._inplanes = width # this is a *mutable* variable used during construction
121
+ self.layer1 = self._make_layer(width, layers[0])
122
+ self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
123
+ self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
124
+ self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
125
+
126
+ embed_dim = width * 32 # the ResNet feature dimension
127
+ self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)
128
+
129
+ def _make_layer(self, planes, blocks, stride=1):
130
+ layers = [Bottleneck(self._inplanes, planes, stride)]
131
+
132
+ self._inplanes = planes * Bottleneck.expansion
133
+ for _ in range(1, blocks):
134
+ layers.append(Bottleneck(self._inplanes, planes))
135
+
136
+ return nn.Sequential(*layers)
137
+
138
+ def forward(self, x):
139
+ def stem(x):
140
+ x = self.relu1(self.bn1(self.conv1(x)))
141
+ x = self.relu2(self.bn2(self.conv2(x)))
142
+ x = self.relu3(self.bn3(self.conv3(x)))
143
+ x = self.avgpool(x)
144
+ return x
145
+
146
+ x = x.type(self.conv1.weight.dtype)
147
+ x = stem(x)
148
+ x = self.layer1(x)
149
+ x = self.layer2(x)
150
+ x = self.layer3(x)
151
+ x = self.layer4(x)
152
+ x = self.attnpool(x)
153
+
154
+ return x
155
+
156
+
157
+ class LayerNorm(nn.LayerNorm):
158
+ """Subclass torch's LayerNorm to handle fp16."""
159
+
160
+ def forward(self, x: torch.Tensor):
161
+ orig_type = x.dtype
162
+ ret = super().forward(x.type(torch.float32))
163
+ return ret.type(orig_type)
164
+
165
+
166
+ class QuickGELU(nn.Module):
167
+ def forward(self, x: torch.Tensor):
168
+ return x * torch.sigmoid(1.702 * x)
169
+
170
+
171
+ class ResidualAttentionBlock(nn.Module):
172
+ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
173
+ super().__init__()
174
+
175
+ self.attn = nn.MultiheadAttention(d_model, n_head)
176
+ self.ln_1 = LayerNorm(d_model)
177
+ self.mlp = nn.Sequential(OrderedDict([
178
+ ("c_fc", nn.Linear(d_model, d_model * 4)),
179
+ ("gelu", QuickGELU()),
180
+ ("c_proj", nn.Linear(d_model * 4, d_model))
181
+ ]))
182
+ self.ln_2 = LayerNorm(d_model)
183
+ self.attn_mask = attn_mask
184
+
185
+ def attention(self, x: torch.Tensor):
186
+ self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
187
+ return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
188
+
189
+ def forward(self, x: torch.Tensor):
190
+ x = x + self.attention(self.ln_1(x))
191
+ x = x + self.mlp(self.ln_2(x))
192
+ return x
193
+
194
+
195
+ class Transformer(nn.Module):
196
+ def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
197
+ super().__init__()
198
+ self.width = width
199
+ self.layers = layers
200
+ self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])
201
+
202
+ def forward(self, x: torch.Tensor):
203
+ return self.resblocks(x)
204
+
205
+
206
+ class VisionTransformer(nn.Module):
207
+ def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
208
+ super().__init__()
209
+ self.input_resolution = input_resolution
210
+ self.output_dim = output_dim
211
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
212
+
213
+ scale = width ** -0.5
214
+ self.class_embedding = nn.Parameter(scale * torch.randn(width))
215
+ self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
216
+ self.ln_pre = LayerNorm(width)
217
+
218
+ self.transformer = Transformer(width, layers, heads)
219
+
220
+ self.ln_post = LayerNorm(width)
221
+ self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
222
+
223
+ def forward(self, x: torch.Tensor):
224
+ x = self.conv1(x) # shape = [*, width, grid, grid]
225
+ x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
226
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
227
+ x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
228
+ x = x + self.positional_embedding.to(x.dtype)
229
+ x = self.ln_pre(x)
230
+
231
+ x = x.permute(1, 0, 2) # NLD -> LND
232
+ x = self.transformer(x)
233
+ x = x.permute(1, 0, 2) # LND -> NLD
234
+
235
+ x = self.ln_post(x[:, 0, :])
236
+
237
+ if self.proj is not None:
238
+ x = x @ self.proj
239
+
240
+ return x
241
+
242
+
243
+ class CLIP(nn.Module):
244
+ def __init__(self,
245
+ embed_dim: int,
246
+ # vision
247
+ image_resolution: int,
248
+ vision_layers: Union[Tuple[int, int, int, int], int],
249
+ vision_width: int,
250
+ vision_patch_size: int,
251
+ # text
252
+ context_length: int,
253
+ vocab_size: int,
254
+ transformer_width: int,
255
+ transformer_heads: int,
256
+ transformer_layers: int
257
+ ):
258
+ super().__init__()
259
+
260
+ self.context_length = context_length
261
+
262
+ if isinstance(vision_layers, (tuple, list)):
263
+ vision_heads = vision_width * 32 // 64
264
+ self.visual = ModifiedResNet(
265
+ layers=vision_layers,
266
+ output_dim=embed_dim,
267
+ heads=vision_heads,
268
+ input_resolution=image_resolution,
269
+ width=vision_width
270
+ )
271
+ else:
272
+ vision_heads = vision_width // 64
273
+ self.visual = VisionTransformer(
274
+ input_resolution=image_resolution,
275
+ patch_size=vision_patch_size,
276
+ width=vision_width,
277
+ layers=vision_layers,
278
+ heads=vision_heads,
279
+ output_dim=embed_dim
280
+ )
281
+
282
+ self.transformer = Transformer(
283
+ width=transformer_width,
284
+ layers=transformer_layers,
285
+ heads=transformer_heads,
286
+ attn_mask=self.build_attention_mask()
287
+ )
288
+
289
+ self.vocab_size = vocab_size
290
+ self.token_embedding = nn.Embedding(vocab_size, transformer_width)
291
+ self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
292
+ self.ln_final = LayerNorm(transformer_width)
293
+
294
+ self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
295
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
296
+
297
+ self.initialize_parameters()
298
+
299
+ def initialize_parameters(self):
300
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
301
+ nn.init.normal_(self.positional_embedding, std=0.01)
302
+
303
+ if isinstance(self.visual, ModifiedResNet):
304
+ if self.visual.attnpool is not None:
305
+ std = self.visual.attnpool.c_proj.in_features ** -0.5
306
+ nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
307
+ nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
308
+ nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
309
+ nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
310
+
311
+ for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
312
+ for name, param in resnet_block.named_parameters():
313
+ if name.endswith("bn3.weight"):
314
+ nn.init.zeros_(param)
315
+
316
+ proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
317
+ attn_std = self.transformer.width ** -0.5
318
+ fc_std = (2 * self.transformer.width) ** -0.5
319
+ for block in self.transformer.resblocks:
320
+ nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
321
+ nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
322
+ nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
323
+ nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
324
+
325
+ if self.text_projection is not None:
326
+ nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
327
+
328
+ def build_attention_mask(self):
329
+ # lazily create causal attention mask, with full attention between the vision tokens
330
+ # pytorch uses additive attention mask; fill with -inf
331
+ mask = torch.empty(self.context_length, self.context_length)
332
+ mask.fill_(float("-inf"))
333
+ mask.triu_(1) # zero out the lower diagonal
334
+ return mask
335
+
336
+ @property
337
+ def dtype(self):
338
+ return self.visual.conv1.weight.dtype
339
+
340
+ def encode_image(self, image):
341
+ return self.visual(image.type(self.dtype))
342
+
343
+ def encode_text(self, text):
344
+ x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
345
+
346
+ x = x + self.positional_embedding.type(self.dtype)
347
+ x = x.permute(1, 0, 2) # NLD -> LND
348
+ x = self.transformer(x)
349
+ x = x.permute(1, 0, 2) # LND -> NLD
350
+ x = self.ln_final(x).type(self.dtype)
351
+
352
+ # x.shape = [batch_size, n_ctx, transformer.width]
353
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
354
+ x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
355
+
356
+ return x
357
+
358
+ def forward(self, image, text):
359
+ image_features = self.encode_image(image)
360
+ text_features = self.encode_text(text)
361
+
362
+ # normalized features
363
+ image_features = image_features / image_features.norm(dim=1, keepdim=True)
364
+ text_features = text_features / text_features.norm(dim=1, keepdim=True)
365
+
366
+ # cosine similarity as logits
367
+ logit_scale = self.logit_scale.exp()
368
+ logits_per_image = logit_scale * image_features @ text_features.t()
369
+ logits_per_text = logits_per_image.t()
370
+
371
+ # shape = [global_batch_size, global_batch_size]
372
+ return logits_per_image, logits_per_text
373
+
374
+
375
+ def convert_weights(model: nn.Module):
376
+ """Convert applicable model parameters to fp16"""
377
+
378
+ def _convert_weights_to_fp16(l):
379
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
380
+ l.weight.data = l.weight.data.half()
381
+ if l.bias is not None:
382
+ l.bias.data = l.bias.data.half()
383
+
384
+ if isinstance(l, nn.MultiheadAttention):
385
+ for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
386
+ tensor = getattr(l, attr)
387
+ if tensor is not None:
388
+ tensor.data = tensor.data.half()
389
+
390
+ for name in ["text_projection", "proj"]:
391
+ if hasattr(l, name):
392
+ attr = getattr(l, name)
393
+ if attr is not None:
394
+ attr.data = attr.data.half()
395
+
396
+ model.apply(_convert_weights_to_fp16)
397
+
398
+
399
+ def build_model(state_dict: dict):
400
+ vit = "visual.proj" in state_dict
401
+
402
+ if vit:
403
+ vision_width = state_dict["visual.conv1.weight"].shape[0]
404
+ vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
405
+ vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
406
+ grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
407
+ image_resolution = vision_patch_size * grid_size
408
+ else:
409
+ counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
410
+ vision_layers = tuple(counts)
411
+ vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
412
+ output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
413
+ vision_patch_size = None
414
+ assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
415
+ image_resolution = output_width * 32
416
+
417
+ embed_dim = state_dict["text_projection"].shape[1]
418
+ context_length = state_dict["positional_embedding"].shape[0]
419
+ vocab_size = state_dict["token_embedding.weight"].shape[0]
420
+ transformer_width = state_dict["ln_final.weight"].shape[0]
421
+ transformer_heads = transformer_width // 64
422
+ transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))
423
+
424
+ model = CLIP(
425
+ embed_dim,
426
+ image_resolution, vision_layers, vision_width, vision_patch_size,
427
+ context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
428
+ )
429
+
430
+ for key in ["input_resolution", "context_length", "vocab_size"]:
431
+ if key in state_dict:
432
+ del state_dict[key]
433
+
434
+ # convert_weights(model)
435
+ model.load_state_dict(state_dict)
436
+ del state_dict
437
+ torch.cuda.empty_cache()
438
+ return model.eval()
clip/simple_tokenizer.py ADDED
@@ -0,0 +1,132 @@
1
+ import gzip
2
+ import html
3
+ import os
4
+ from functools import lru_cache
5
+
6
+ import ftfy
7
+ import regex as re
8
+
9
+
10
+ @lru_cache()
11
+ def default_bpe():
12
+ return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
13
+
14
+
15
+ @lru_cache()
16
+ def bytes_to_unicode():
17
+ """
18
+ Returns a list of utf-8 bytes and a corresponding list of unicode strings.
19
+ The reversible bpe codes work on unicode strings.
20
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
21
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
22
+ This is a significant percentage of your normal, say, 32K bpe vocab.
23
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
24
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
25
+ """
26
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
27
+ cs = bs[:]
28
+ n = 0
29
+ for b in range(2**8):
30
+ if b not in bs:
31
+ bs.append(b)
32
+ cs.append(2**8+n)
33
+ n += 1
34
+ cs = [chr(n) for n in cs]
35
+ return dict(zip(bs, cs))
36
+
37
+
38
+ def get_pairs(word):
39
+ """Return set of symbol pairs in a word.
40
+ Word is represented as tuple of symbols (symbols being variable-length strings).
41
+ """
42
+ pairs = set()
43
+ prev_char = word[0]
44
+ for char in word[1:]:
45
+ pairs.add((prev_char, char))
46
+ prev_char = char
47
+ return pairs
48
+
49
+
50
+ def basic_clean(text):
51
+ text = ftfy.fix_text(text)
52
+ text = html.unescape(html.unescape(text))
53
+ return text.strip()
54
+
55
+
56
+ def whitespace_clean(text):
57
+ text = re.sub(r'\s+', ' ', text)
58
+ text = text.strip()
59
+ return text
60
+
61
+
62
+ class SimpleTokenizer(object):
63
+ def __init__(self, bpe_path: str = default_bpe()):
64
+ self.byte_encoder = bytes_to_unicode()
65
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
66
+ merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
67
+ merges = merges[1:49152-256-2+1]
68
+ merges = [tuple(merge.split()) for merge in merges]
69
+ vocab = list(bytes_to_unicode().values())
70
+ vocab = vocab + [v+'</w>' for v in vocab]
71
+ for merge in merges:
72
+ vocab.append(''.join(merge))
73
+ vocab.extend(['<|startoftext|>', '<|endoftext|>'])
74
+ self.encoder = dict(zip(vocab, range(len(vocab))))
75
+ self.decoder = {v: k for k, v in self.encoder.items()}
76
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
77
+ self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
78
+ self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
79
+
80
+ def bpe(self, token):
81
+ if token in self.cache:
82
+ return self.cache[token]
83
+ word = tuple(token[:-1]) + ( token[-1] + '</w>',)
84
+ pairs = get_pairs(word)
85
+
86
+ if not pairs:
87
+ return token+'</w>'
88
+
89
+ while True:
90
+ bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
91
+ if bigram not in self.bpe_ranks:
92
+ break
93
+ first, second = bigram
94
+ new_word = []
95
+ i = 0
96
+ while i < len(word):
97
+ try:
98
+ j = word.index(first, i)
99
+ new_word.extend(word[i:j])
100
+ i = j
101
+ except ValueError:  # 'first' not found in the remainder of the word
102
+ new_word.extend(word[i:])
103
+ break
104
+
105
+ if word[i] == first and i < len(word)-1 and word[i+1] == second:
106
+ new_word.append(first+second)
107
+ i += 2
108
+ else:
109
+ new_word.append(word[i])
110
+ i += 1
111
+ new_word = tuple(new_word)
112
+ word = new_word
113
+ if len(word) == 1:
114
+ break
115
+ else:
116
+ pairs = get_pairs(word)
117
+ word = ' '.join(word)
118
+ self.cache[token] = word
119
+ return word
120
+
121
+ def encode(self, text):
122
+ bpe_tokens = []
123
+ text = whitespace_clean(basic_clean(text)).lower()
124
+ for token in re.findall(self.pat, text):
125
+ token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
126
+ bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
127
+ return bpe_tokens
128
+
129
+ def decode(self, tokens):
130
+ text = ''.join([self.decoder[token] for token in tokens])
131
+ text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
132
+ return text
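
Note: a short usage sketch of `SimpleTokenizer` above; it assumes the bundled `bpe_simple_vocab_16e6.txt.gz` is available and that `ftfy` and `regex` are installed. The example string is arbitrary.

from clip.simple_tokenizer import SimpleTokenizer

tokenizer = SimpleTokenizer()
ids = tokenizer.encode("a photo of a chest x-ray")  # list of BPE token ids
text = tokenizer.decode(ids)                        # decoded text ('</w>' word-end markers become spaces)
print(ids)
print(text)
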
data/__init__.py ADDED
File without changes
data/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (146 Bytes). View file
 
data/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (162 Bytes). View file
 
data/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (150 Bytes). View file
 
data/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (144 Bytes). View file
 
data/__pycache__/augmix_ops.cpython-310.pyc ADDED
Binary file (3.69 kB). View file
 
data/__pycache__/augmix_ops.cpython-311.pyc ADDED
Binary file (6.76 kB). View file
 
data/__pycache__/augmix_ops.cpython-312.pyc ADDED
Binary file (6.24 kB). View file
 
data/__pycache__/augmix_ops.cpython-39.pyc ADDED
Binary file (3.88 kB). View file
 
data/__pycache__/cls_to_names.cpython-310.pyc ADDED
Binary file (23.9 kB). View file
 
data/__pycache__/cls_to_names.cpython-312.pyc ADDED
Binary file (26.6 kB). View file
 
data/__pycache__/cls_to_names.cpython-39.pyc ADDED
Binary file (19.9 kB). View file