ftfy or spacy is not installed using BERT BasicTokenizer instead of ftfy.
# Next, we get about 25k images from Unsplash.
# The download and extraction only run when the image folder is missing or
# empty, so re-running this cell is cheap once the photos are on disk.
img_folder = 'photos/'
if not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:
    os.makedirs(img_folder, exist_ok=True)

    photo_filename = 'unsplash-25k-photos.zip'
    if not os.path.exists(photo_filename):  # Download dataset if it does not exist
        util.http_get('http://sbert.net/datasets/' + photo_filename, photo_filename)

    # Extract all images
    with zipfile.ZipFile(photo_filename, 'r') as zf:
        for member in tqdm(zf.infolist(), desc='Extracting'):
            zf.extract(member, img_folder)
# Now, we need to compute the embeddings.
# To speed things up, we distribute pre-computed embeddings.
# Otherwise you can also encode the images yourself.
# To encode an image, you can use the following code:
#   from PIL import Image
#   img_emb = model.encode(Image.open(filepath))
use_precomputed_embeddings = True

if use_precomputed_embeddings:
    emb_filename = 'unsplash-25k-photos-embeddings.pkl'
    if not os.path.exists(emb_filename):  # Download dataset if it does not exist
        util.http_get('http://sbert.net/datasets/' + emb_filename, emb_filename)

    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file, and this file is fetched over plain HTTP. Only load it if you
    # trust the source and the network path.
    with open(emb_filename, 'rb') as fIn:
        img_names, img_emb = pickle.load(fIn)
    print("Images:", len(img_names))
else:
    # Encode the photos that were extracted into img_folder. img_names holds
    # bare file names (not full paths) because search() joins each entry with
    # img_folder before displaying it.
    img_paths = sorted(glob.glob(os.path.join(img_folder, '*.jpg')))
    img_names = [os.path.basename(filepath) for filepath in img_paths]
    print("Images:", len(img_names))
    img_emb = model.encode(
        [Image.open(filepath) for filepath in img_paths],
        batch_size=128,
        convert_to_tensor=True,
        show_progress_bar=True,
    )
Images: 24996
# Next, we define a search function.
def search(query, k=3):
    """Print and display the k images most similar to ``query``.

    Args:
        query: The search query, which can either be an image or a text
            string; it is passed to ``model.encode`` as a one-element batch.
        k: Number of top-ranked images to show (default 3).

    Returns:
        None. The query, the file name of each hit and the image itself are
        printed/displayed as a side effect.
    """
    # First, we encode the query (which can either be an image or a text string)
    query_emb = model.encode([query], convert_to_tensor=True, show_progress_bar=False)

    # Then, we use the util.semantic_search function, which computes the
    # cosine-similarity between the query embedding and all image embeddings.
    # It returns one hit list per query; we sent a single query, hence [0].
    hits = util.semantic_search(query_emb, img_emb, top_k=k)[0]

    print("Query:")
    display(query)
    for hit in hits:
        img_name = img_names[hit['corpus_id']]
        print(img_name)
        display(IPImage(os.path.join(img_folder, img_name), width=200))
# Text-to-image query; k is left at its default of 3, so three hits are shown.
search("Two cats playing on the street")
Query:
'Two cats playing on the street'
4mA9_5vbZ_s.jpg
w6tMRf7kGLA.jpg
n4pNuXxyIr4.jpg
# Text-to-image query (default k=3).
# NOTE(review): "montain" is misspelled, but the recorded output below was
# produced with this exact string, so the query is kept as-is.
search("A sunset on the montain")
Query:
'A sunset on the montain'
Zf4jpcGEinM.jpg
G5JDRSKi3uY.jpg
ig9yVlj5YYg.jpg
# Single-word text query (default k=3).
search("Oslo")
Query:
'Oslo'
uHsQou9tWTQ.jpg
0ABCZ9bTsw4.jpg
d5_hjWQ4NwA.jpg
# Text-to-image query (default k=3).
search("A dog in a park")
Query:
'A dog in a park'
IVyZrLp41D0.jpg
0O9A0F_d1qA.jpg
KVeogBZzl4M.jpg
# Text-to-image query (default k=3).
search("A beach with palm trees")
Query:
'A beach with palm trees'
7rrgPPljqYU.jpg
kmihWgpbDEg.jpg
ZyfOq52b0cs.jpg
Image-to-Image Search
You can also use this method for image-to-image search.
To do so, pass Image.open('path/to/image.jpg') as the query to the search function instead of a text string.