%%capture
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util
from PIL import Image
import glob
import torch
import pickle
import zipfile
from IPython.display import display
from IPython.display import Image as IPImage
import os
from tqdm.autonotebook import tqdm
# Here we load the multilingual CLIP model. Note, this model can only encode text.
# If you need embeddings for images, you must load the 'clip-ViT-B-32' model
model = SentenceTransformer('clip-ViT-B-32-multilingual-v1')
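# Both models map into the same vector space: the multilingual model embeds the text,
# the 'clip-ViT-B-32' model embeds the image, and util.cos_sim compares the two.
# A minimal sketch of this pairing ('example.jpg' is just a placeholder file name):
#
# img_model = SentenceTransformer('clip-ViT-B-32')
# text_emb = model.encode(['Ein Hund im Schnee'], convert_to_tensor=True)
# img_emb_single = img_model.encode(Image.open('example.jpg'), convert_to_tensor=True)
# print(util.cos_sim(text_emb, img_emb_single))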
# Next, we get about 25k images from Unsplash
img_folder = 'photos/'
if not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:
    os.makedirs(img_folder, exist_ok=True)

    photo_filename = 'unsplash-25k-photos.zip'
    if not os.path.exists(photo_filename):   # Download dataset if it does not exist
        util.http_get('http://sbert.net/datasets/'+photo_filename, photo_filename)

    # Extract all images
    with zipfile.ZipFile(photo_filename, 'r') as zf:
        for member in tqdm(zf.infolist(), desc='Extracting'):
            zf.extract(member, img_folder)
# Now, we need to compute the embeddings
# To speed things up, we distribute pre-computed embeddings.
# Otherwise you can also encode the images yourself.
# To encode an image, you can use the following code:
# from PIL import Image
# img_emb = model.encode(Image.open(filepath))
use_precomputed_embeddings = True

if use_precomputed_embeddings:
    emb_filename = 'unsplash-25k-photos-embeddings.pkl'
    if not os.path.exists(emb_filename):   # Download dataset if it does not exist
        util.http_get('http://sbert.net/datasets/'+emb_filename, emb_filename)

    with open(emb_filename, 'rb') as fIn:
        img_names, img_emb = pickle.load(fIn)
    print("Images:", len(img_names))
else:
    # For embedding images, we need the non-multilingual CLIP model
    img_model = SentenceTransformer('clip-ViT-B-32')

    img_names = list(glob.glob('photos/*.jpg'))
    print("Images:", len(img_names))
    img_emb = img_model.encode([Image.open(filepath) for filepath in img_names], batch_size=128, convert_to_tensor=True, show_progress_bar=True)
Images: 24996
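# Optionally, you can cache self-computed embeddings in the same (img_names, img_emb)
# format that the pickle above is loaded with, so later runs can skip the encoding step.
# A minimal sketch ('my-photos-embeddings.pkl' is just a placeholder file name):
#
# with open('my-photos-embeddings.pkl', 'wb') as fOut:
#     pickle.dump((img_names, img_emb), fOut)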
# Sanity check: compare one pre-computed embedding against a freshly computed one
import torch

filepath = 'photos/'+img_names[0]
one_emb = torch.tensor(img_emb[0])
img_model = SentenceTransformer('clip-ViT-B-32')
comb_emb = img_model.encode(Image.open(filepath), convert_to_tensor=True).cpu()
print(util.cos_sim(one_emb, comb_emb))
tensor([[1.0000]])
# Next, we define a search function.
def search(query, k=3):
    # First, we encode the query (which can either be an image or a text string)
    query_emb = model.encode([query], convert_to_tensor=True, show_progress_bar=False)

    # Then, we use the util.semantic_search function, which computes the cosine-similarity
    # between the query embedding and all image embeddings.
    # It then returns the top_k highest ranked images, which we output
    hits = util.semantic_search(query_emb, img_emb, top_k=k)[0]

    print("Query:")
    display(query)
    for hit in hits:
        print(img_names[hit['corpus_id']])
        display(IPImage(os.path.join(img_folder, img_names[hit['corpus_id']]), width=200))
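# For reference, util.semantic_search is roughly equivalent to computing the cosine
# scores with util.cos_sim and taking the top-k entries manually. A minimal sketch
# (search_manual is just an illustrative name; it assumes img_emb fits in memory):
def search_manual(query, k=3):
    query_emb = model.encode([query], convert_to_tensor=True, show_progress_bar=False).cpu()
    cos_scores = util.cos_sim(query_emb, torch.as_tensor(img_emb).cpu())[0]
    top_results = torch.topk(cos_scores, k=k)
    for score, idx in zip(top_results.values, top_results.indices):
        print(f"{img_names[int(idx)]} (score: {score.item():.4f})")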
"Two dogs playing in the snow") search(
Query:
'Two dogs playing in the snow'
lyStEjlKNSw.jpg
FAcSe7SjDUU.jpg
Hb6nGDgWztE.jpg
#German: A cat on a chair
"Eine Katze auf einem Stuhl") search(
Query:
'Eine Katze auf einem Stuhl'
CgGDzMYdYw8.jpg
kjERLXaHjXc.jpg
I-YJ-gaJNaw.jpg
#Spanish: Many fish
"Muchos peces") search(
Query:
'Muchos peces'
H22jcGTyrS4.jpg
CJ_9I6aXSnc.jpg
_MJKaRig1Ic.jpg
#Chinese: A beach with palm trees
"棕榈树的沙滩") search(
Query:
'棕榈树的沙滩'
crIXKhUDpBI.jpg
_6iV1AJZ53s.jpg
rv63du1a79E.jpg
#Russian: A sunset on the beach
"Закат на пляже") search(
Query:
'Закат на пляже'
JC5U3Eyiyr4.jpg
5z1QDcisnJ8.jpg
rdG4hRoyVR0.jpg
#Turkish: A dog in a park
"Parkta bir köpek") search(
Query:
'Parkta bir köpek'
ROJLfAbL1Ig.jpg
0O9A0F_d1qA.jpg
4mdsPUtN0P0.jpg
# Japanese: New York at night
"夜のニューヨーク") search(
Query:
'夜のニューヨーク'
FGjR4IGwP7U.jpg
8nCMOFYyXF4.jpg
ZAOEjcpdMkc.jpg
# Portuguese: Two dogs
search("Dois cachorro")
Query:
'Dois cachorro'
kFucQoKaQ3g.jpg
aPtPQFyLxMM.jpg
oAGoeMbr1-4.jpg