Multilingual Joint Image & Text Embeddings

%%capture
!pip install sentence-transformers

from sentence_transformers import SentenceTransformer, util
from PIL import Image
import glob
import torch
import pickle
import zipfile
from IPython.display import display
from IPython.display import Image as IPImage
import os
from tqdm.autonotebook import tqdm

# Load the multilingual CLIP model. Note that it is a text-only encoder:
# it cannot embed images. Image embeddings must be produced with the
# original 'clip-ViT-B-32' model instead.
model = SentenceTransformer("clip-ViT-B-32-multilingual-v1")

# Fetch the Unsplash photo collection (about 25k images), unless the
# image folder already exists and contains files.
img_folder = 'photos/'
if not (os.path.exists(img_folder) and os.listdir(img_folder)):
    os.makedirs(img_folder, exist_ok=True)

    # Download the zip archive only when there is no local copy yet.
    photo_filename = 'unsplash-25k-photos.zip'
    if not os.path.exists(photo_filename):
        util.http_get(f'http://sbert.net/datasets/{photo_filename}', photo_filename)

    # Unpack every archive member into the image folder, with a progress bar.
    with zipfile.ZipFile(photo_filename, 'r') as archive:
        for entry in tqdm(archive.infolist(), desc='Extracting'):
            archive.extract(entry, img_folder)

# Now, we need embeddings for all images.
# To speed things up, we distribute pre-computed embeddings.
# Set use_precomputed_embeddings to False to encode the images yourself.
# A single image is encoded with the (non-multilingual) image model like this:
# from PIL import Image
# img_emb = SentenceTransformer('clip-ViT-B-32').encode(Image.open(filepath))

use_precomputed_embeddings = True

if use_precomputed_embeddings:
    emb_filename = 'unsplash-25k-photos-embeddings.pkl'
    if not os.path.exists(emb_filename):   # Download the embeddings if they do not exist
        util.http_get('http://sbert.net/datasets/'+emb_filename, emb_filename)

    # SECURITY NOTE: unpickling can execute arbitrary code, and this file is
    # downloaded over plain HTTP. Only load it if you trust the source/network.
    with open(emb_filename, 'rb') as fIn:
        img_names, img_emb = pickle.load(fIn)
    print("Images:", len(img_names))
else:
    # For embedding images, we need the non-multilingual CLIP model
    img_model = SentenceTransformer('clip-ViT-B-32')

    # Store bare file names (no folder prefix), matching the format of the
    # pre-computed names: later code joins each name with img_folder, so full
    # paths here would resolve to 'photos/photos/<name>.jpg'.
    img_names = [os.path.basename(path) for path in glob.glob(os.path.join(img_folder, '*.jpg'))]
    print("Images:", len(img_names))
    img_emb = img_model.encode(
        [Image.open(os.path.join(img_folder, name)) for name in img_names],
        batch_size=128,
        convert_to_tensor=True,
        show_progress_bar=True,
    )

Images: 24996

# Sanity check: re-encode the first image with the CLIP image model and compare
# the result with the stored embedding via cosine similarity.
import torch
# basename() keeps the path valid whether img_names holds bare file names
# or paths that already include the folder.
filepath = os.path.join(img_folder, os.path.basename(img_names[0]))
# as_tensor accepts numpy arrays and tensors alike (no copy-construct warning);
# .cpu() puts the stored embedding on the same device as comb_emb below.
one_emb = torch.as_tensor(img_emb[0]).cpu()
img_model = SentenceTransformer('clip-ViT-B-32')
comb_emb = img_model.encode(Image.open(filepath), convert_to_tensor=True).cpu()
print(util.cos_sim(one_emb, comb_emb))

ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.
/usr/local/lib/python3.7/dist-packages/transformers/feature_extraction_utils.py:158: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at  ../torch/csrc/utils/tensor_new.cpp:201.)
  tensor = as_tensor(value)

tensor([[1.0000]])

# Next, we define a search function.
def search(query, k=3):
    """Show the query followed by the k best-matching images.

    Args:
        query: The search query. It is encoded with the module-level ``model``
            (the multilingual CLIP text model), so it should be a text string.
        k: Number of top-ranked images to show.

    Returns:
        None. The query, the matching file names and the images themselves
        are printed/displayed as a side effect.
    """
    # Encode the query as a batch of one item.
    encoded_query = model.encode([query], convert_to_tensor=True, show_progress_bar=False)

    # util.semantic_search scores the query embedding against all image
    # embeddings (cosine similarity) and returns one ranked hit list per
    # query; we passed a single query, so we take the first list.
    best_matches = util.semantic_search(encoded_query, img_emb, top_k=k)[0]

    print("Query:")
    display(query)
    for match in best_matches:
        file_name = img_names[match['corpus_id']]
        print(file_name)
        display(IPImage(os.path.join(img_folder, file_name), width=200))

# English: Two dogs playing in the snow
search("Two dogs playing in the snow")

Query:

'Two dogs playing in the snow'

lyStEjlKNSw.jpg

FAcSe7SjDUU.jpg
Hb6nGDgWztE.jpg

# German: A cat on a chair
search("Eine Katze auf einem Stuhl")

Query:

'Eine Katze auf einem Stuhl'

CgGDzMYdYw8.jpg

kjERLXaHjXc.jpg

I-YJ-gaJNaw.jpg

# Spanish: Many fish
search("Muchos peces")

Query:

'Muchos peces'

H22jcGTyrS4.jpg

CJ_9I6aXSnc.jpg

_MJKaRig1Ic.jpg

# Chinese: A beach with palm trees
search("棕榈树的沙滩")

Query:

'棕榈树的沙滩'

crIXKhUDpBI.jpg

_6iV1AJZ53s.jpg

rv63du1a79E.jpg

# Russian: A sunset on the beach
search("Закат на пляже")

Query:

'Закат на пляже'

JC5U3Eyiyr4.jpg

5z1QDcisnJ8.jpg

rdG4hRoyVR0.jpg

# Turkish: A dog in a park
search("Parkta bir köpek")

Query:

'Parkta bir köpek'

ROJLfAbL1Ig.jpg

0O9A0F_d1qA.jpg

4mdsPUtN0P0.jpg

# Japanese: New York at night
search("夜のニューヨーク")

Query:

'夜のニューヨーク'

FGjR4IGwP7U.jpg

8nCMOFYyXF4.jpg

ZAOEjcpdMkc.jpg

# Portuguese: "Two dog" (literal gloss; the noun in the query is singular).
# NOTE(review): the grammatical form would be "Dois cachorros" - confirm
# whether the ungrammatical query is deliberate.
search("Dois cachorro")

Query:

'Dois cachorro'

kFucQoKaQ3g.jpg

aPtPQFyLxMM.jpg

oAGoeMbr1-4.jpg