From 8edc250058427e075244673b5b7beefeba746952 Mon Sep 17 00:00:00 2001
From: sebastian
Date: Fri, 8 May 2026 18:02:51 +0200
Subject: [PATCH] ADD: Filter Duplicate images out

Embed every image below SOURCE with the open_clip "ViT-B-32" model and,
before the manual labelling loop starts, drop each image whose cosine
similarity to an earlier kept image is above THRESHOLD (0.95).

---
Review notes (text between "---" and the diff is not part of the commit):
- One diff line per text line again; Python indentation was rebuilt as
  4 spaces and should be checked against the target file.
- Leading spaces inside the two progress/error f-strings are reproduced as
  found in the collapsed text; verify them against the author's copy.
- get_embedding() now opens images in a `with` block.
- An empty embeddings list no longer reaches cosine_similarity().
- The hunk that only removed the final newline of the file was dropped.
- The post-image blob id on the "index" line predates these edits.

 classifier-training/manual-classifier.py | 66 ++++++++++++++++++++++--
 1 file changed, 61 insertions(+), 5 deletions(-)

diff --git a/classifier-training/manual-classifier.py b/classifier-training/manual-classifier.py
index 51903b2..b0b4a50 100644
--- a/classifier-training/manual-classifier.py
+++ b/classifier-training/manual-classifier.py
@@ -1,13 +1,15 @@
-# quick_labeler.py
 import random
-import shutil, os
+import shutil
 from pathlib import Path
 from PIL import Image
 import matplotlib.pyplot as plt
+import torch
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+import open_clip
 
 SOURCE = Path("alle_meine_fotos/")
-images = list(SOURCE.glob("**/*.jpg")) + list(SOURCE.glob("**/*.png"))
-
+THRESHOLD = 0.95  # cosine similarity above which two images count as duplicates
 DIRS = [
     "dataset/train/wallpaper",
     "dataset/train/no_wallpaper",
@@ -16,7 +18,61 @@ DIRS = [
 for d in DIRS:
     Path(d).mkdir(parents=True, exist_ok=True)
 
-for img_path in images:
+# --- Compute embeddings ---
+print("Lade CLIP Modell...")
+clip_model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
+clip_model.eval()
+
+
+def get_embedding(img_path):
+    """Return the CLIP embedding of the image at img_path as a flat array.
+
+    Returns None (after printing the error) when the image cannot be opened
+    or encoded, so that a single bad file does not abort the whole run.
+    """
+    try:
+        # Close the file handle as soon as the pixels have been read.
+        with Image.open(img_path) as raw:
+            img = preprocess(raw.convert("RGB")).unsqueeze(0)
+        with torch.no_grad():
+            return clip_model.encode_image(img).numpy().flatten()
+    except Exception as e:
+        print(f" Fehler bei {img_path.name}: {e}")
+        return None
+
+
+all_images = list(SOURCE.glob("**/*.jpg")) + list(SOURCE.glob("**/*.png"))
+
+print(f"{len(all_images)} Bilder gefunden, berechne Embeddings...")
+embeddings, valid_images = [], []
+for i, p in enumerate(all_images):
+    print(f" {i+1}/{len(all_images)}: {p.name}", end="\r")
+    emb = get_embedding(p)
+    if emb is not None:
+        embeddings.append(emb)
+        valid_images.append(p)
+
+# --- Filter out duplicates ---
+print(f"\nFiltere Duplikate (Threshold={THRESHOLD})...")
+# cosine_similarity() rejects an empty input, so fall back to an empty matrix
+# when no image could be embedded; the loop below then simply does nothing.
+sim_matrix = cosine_similarity(embeddings) if embeddings else np.empty((0, 0))
+to_skip = set()
+for i in range(len(valid_images)):
+    if i in to_skip:  # already a duplicate of an earlier kept image
+        continue
+    for j in range(i + 1, len(valid_images)):
+        if sim_matrix[i][j] > THRESHOLD:
+            to_skip.add(j)
+
+unique_images = [p for i, p in enumerate(valid_images) if i not in to_skip]
+print(f"{len(valid_images) - len(unique_images)} Duplikate entfernt, {len(unique_images)} verbleiben.\n")
+
+# The model and the similarity data are no longer needed; drop the references.
+del clip_model, embeddings, sim_matrix
+
+# --- Labelling loop ---
+for img_path in unique_images:
     img = Image.open(img_path)
     plt.imshow(img)
     plt.title(img_path.name)