ADD: Filter Duplicate images out

This commit is contained in:
sebastian 2026-05-08 18:02:51 +02:00
parent cc301ce395
commit 8edc250058

View File

@ -1,13 +1,15 @@
# quick_labeler.py
import random
import shutil, os
import shutil
from pathlib import Path
from PIL import Image
import matplotlib.pyplot as plt
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import open_clip
SOURCE = Path("alle_meine_fotos/")
images = list(SOURCE.glob("**/*.jpg")) + list(SOURCE.glob("**/*.png"))
THRESHOLD = 0.95
DIRS = [
"dataset/train/wallpaper", "dataset/train/no_wallpaper",
@ -16,7 +18,50 @@ DIRS = [
for d in DIRS:
Path(d).mkdir(parents=True, exist_ok=True)
for img_path in images:
# --- Embeddings berechnen ---
print("Lade CLIP Modell...")
clip_model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
clip_model.eval()
def get_embedding(img_path):
try:
img = preprocess(Image.open(img_path).convert("RGB")).unsqueeze(0)
with torch.no_grad():
return clip_model.encode_image(img).numpy().flatten()
except Exception as e:
print(f" Fehler bei {img_path.name}: {e}")
return None
all_images = list(SOURCE.glob("**/*.jpg")) + list(SOURCE.glob("**/*.png"))
print(f"{len(all_images)} Bilder gefunden, berechne Embeddings...")
embeddings, valid_images = [], []
for i, p in enumerate(all_images):
print(f" {i+1}/{len(all_images)}: {p.name}", end="\r")
emb = get_embedding(p)
if emb is not None:
embeddings.append(emb)
valid_images.append(p)
# --- Duplikate rausfiltern ---
print(f"\nFiltere Duplikate (Threshold={THRESHOLD})...")
sim_matrix = cosine_similarity(embeddings)
to_skip = set()
for i in range(len(valid_images)):
if i in to_skip:
continue
for j in range(i + 1, len(valid_images)):
if sim_matrix[i][j] > THRESHOLD:
to_skip.add(j)
unique_images = [p for i, p in enumerate(valid_images) if i not in to_skip]
print(f"{len(valid_images) - len(unique_images)} Duplikate entfernt, {len(unique_images)} verbleiben.\n")
# Modell kann jetzt freigegeben werden
del clip_model, embeddings, sim_matrix
# --- Labelschleife ---
for img_path in unique_images:
img = Image.open(img_path)
plt.imshow(img)
plt.title(img_path.name)
@ -31,4 +76,4 @@ for img_path in images:
elif label in ("y", "n"):
folder = "wallpaper" if label == "y" else "no_wallpaper"
split = "train" if random.random() < 0.8 else "val"
shutil.copy(img_path, f"dataset/{split}/{folder}/{img_path.name}")
shutil.copy(img_path, f"dataset/{split}/{folder}/{img_path.name}")