
Building an AI-Powered Image Search Engine with Ollama, Gemini, and Microsoft SemanticKernel

Takeaways

The complete code example (each part is explained in detail below):

using System.Diagnostics;
using Google.GenAI;
using Google.GenAI.Types;
using Microsoft.Extensions.AI;
using Microsoft.Extensions.VectorData;
using Microsoft.SemanticKernel.Connectors.InMemory;
using OllamaSharp;
using File = System.IO.File;

namespace ImageSearchApp
{
    public class GalleryItem
    {
        [VectorStoreKey]
        public int Id { get; set; }

        [VectorStoreData]
        public string FilePath { get; set; } = string.Empty;

        [VectorStoreData]
        public string Description { get; set; } = string.Empty;

        // Matches mxbai-embed-large (1024 dimensions)
        [VectorStoreVector(Dimensions: 1024, DistanceFunction = DistanceFunction.CosineSimilarity)]
        public ReadOnlyMemory<float> Vector { get; set; }
    }

    internal class Program
    {
        private static readonly string FolderPath = @"C:\";
        private static readonly string ApiKey = "GEMINI API KEY";

        static async Task Main(string[] args)
        {
            Console.WriteLine("Starting Image Indexing with Gemini 2.5 API...");

            // 1. Setup Clients
            var client = new Client(apiKey: ApiKey);
            var embeddingGenerator = new OllamaApiClient(new Uri("http://localhost:11434/"), "mxbai-embed-large");

            // 2. Create the VectorStoreCollection
            var vectorStore = new InMemoryVectorStore();
            VectorStoreCollection<int, GalleryItem> collection = vectorStore.GetCollection<int, GalleryItem>("gallery");
            await collection.EnsureCollectionExistsAsync();

            // 3. Index Local Images
            await IndexImagesAsync(client, embeddingGenerator, collection);

            // 4. Interactive Search
            await RunSearchLoop(embeddingGenerator, collection);
        }

        static async Task IndexImagesAsync(Client client, IEmbeddingGenerator<string, Embedding<float>> generator, VectorStoreCollection<int, GalleryItem> collection)
        {
            var files = Directory.GetFiles(FolderPath, "*.*", SearchOption.AllDirectories)
                .Where(f => new[] { ".jpg", ".jpeg", ".png" }.Contains(Path.GetExtension(f).ToLower())).ToList();

            int id = 0;
            foreach (var file in files)
            {
                try
                {
                    Console.WriteLine($"Analyzing {Path.GetFileName(file)}...");
                    var bytes = await File.ReadAllBytesAsync(file);

                    var response = await client.Models.GenerateContentAsync(
                        model: "gemini-2.5-flash",
                        contents: new List<Content> {
                            new Content {
                                Parts = new List<Part> {
                                    new Part { Text = "Describe this image in detail. Return JSON with 'description', 'people', 'clothing', and 'scenery'." },
                                    new Part { InlineData = new Blob { MimeType = "image/jpeg", Data = bytes } }
                                }
                            }
                        },
                        config: new GenerateContentConfig { ResponseMimeType = "application/json" }
                    );

                    string description = response.Candidates?[0].Content?.Parts?[0].Text ?? "{}";
                    var vector = await generator.GenerateVectorAsync(description);

                    await collection.UpsertAsync(new GalleryItem
                    {
                        Id = id++,
                        FilePath = file,
                        Description = description,
                        Vector = vector
                    });

                    Console.WriteLine("Done");
                }
                catch (Exception ex) { Console.WriteLine($"{ex.Message}"); }
            }
        }

        static async Task RunSearchLoop(IEmbeddingGenerator<string, Embedding<float>> generator, VectorStoreCollection<int, GalleryItem> collection)
        {
            while (true)
            {
                Console.WriteLine("Describe the photo you want (or 'exit'):");
                var query = Console.ReadLine();
                if (string.IsNullOrEmpty(query) || query == "exit") break;

                var queryVector = await generator.GenerateVectorAsync(query);
                var options = new VectorSearchOptions<GalleryItem> { IncludeVectors = false };
                var searchResults = collection.SearchAsync(queryVector, 10, options);

                var hits = new List<GalleryItem>();
                int i = 1;

                Console.WriteLine("Matches:");
                // Stream the search results asynchronously and keep only strong matches
                await foreach (var result in searchResults)
                {
                    if (result.Score > 0.6)
                    {
                        hits.Add(result.Record);
                        Console.WriteLine($"{i}. {Path.GetFileName(result.Record.FilePath)} [Score: {result.Score:F2}]");
                        i++;
                    }
                }

                if (hits.Count == 0) { Console.WriteLine("No matches found."); continue; }

                Console.WriteLine("\nSelect 1-10 to open, or 'n' for next search:");
                var choice = Console.ReadLine();
                if (int.TryParse(choice, out int sel) && sel >= 1 && sel <= hits.Count)
                {
                    Process.Start(new ProcessStartInfo(hits[sel - 1].FilePath) { UseShellExecute = true });
                }
            }
        }
    }
}
Posted: 1/26/2026
Author: Gal Ratner

Introduction: The Problem with Traditional Image Search

We've all been there. You have thousands of photos scattered across your hard drive, and you're trying to find that one specific picture. You remember it was taken at a beach, there was a person wearing a red jacket, but you can't remember the filename or exactly when you took it. You end up scrolling through hundreds of images, hoping you'll recognize it when you see it.

Traditional file systems organize images by name, date, or folder structure. But our memories don't work that way. We remember images by their content: "the photo of my friend at the coffee shop" or "that sunset picture with the mountains." What if you could search your photo library the same way you remember it—by describing what's actually in the image?

In this post, I'll show you how to build a powerful AI-powered image search engine using C# that lets you search through your photos using natural language queries. We'll combine three powerful technologies: Google's Gemini API for image understanding, Ollama for local embedding generation, and Microsoft's SemanticKernel library for vector storage and similarity search.

What We're Building

Our image search engine will:

  1. Index your local images by analyzing their content using Google's Gemini 2.5 Flash API
  2. Generate semantic embeddings locally using Ollama's embedding models
  3. Store and search vectors efficiently using Microsoft SemanticKernel's vector store
  4. Enable natural language queries like "person wearing sunglasses on a beach" or "group photo at a restaurant"
  5. Return ranked results based on semantic similarity, with the most relevant images first

The best part? Once indexed, searching is lightning-fast and runs entirely locally on your machine—no API calls needed for queries.

Technology Stack Overview

Before diving into the code, let's understand the three key technologies we're leveraging:

Google Gemini API: Vision-to-Text Transformation

Google's Gemini 2.5 Flash is a multimodal AI model that can "see" and understand images. We'll use it to analyze each photo and generate detailed textual descriptions. The API can identify objects, people, actions, scenery, clothing, and much more. This transforms our visual data into structured text that we can work with programmatically.

Why Gemini? It offers excellent image understanding capabilities at competitive pricing, with a generous free tier that's perfect for personal projects. The 2.5 Flash variant is optimized for speed, making it ideal for bulk image processing.

Ollama: Local Embedding Generation

Ollama allows you to run large language models locally on your machine. We're specifically using it to run the mxbai-embed-large model, which generates 1024-dimensional vector embeddings from text.

But what's an embedding? Think of it as a mathematical representation of meaning. Text with similar meanings will have similar embeddings (vectors that are close together in high-dimensional space). This is the secret sauce that makes semantic search possible.

Running embeddings locally with Ollama means:

  • Privacy: Your search queries never leave your machine
  • Speed: No network latency for searches
  • Cost: Zero API costs after initial indexing
  • Offline capability: Search works without internet connection
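
To see this in isolation, here's a minimal sketch that asks a local Ollama instance for a single embedding. It assumes Ollama is running on its default port with mxbai-embed-large already pulled; the same GenerateVectorAsync call appears later in the full application:

using Microsoft.Extensions.AI;
using OllamaSharp;

// OllamaApiClient implements IEmbeddingGenerator<string, Embedding<float>>,
// so the Microsoft.Extensions.AI helpers work directly against it.
IEmbeddingGenerator<string, Embedding<float>> generator =
    new OllamaApiClient(new Uri("http://localhost:11434/"), "mxbai-embed-large");

ReadOnlyMemory<float> vector = await generator.GenerateVectorAsync("a sunset over the ocean");
Console.WriteLine($"Dimensions: {vector.Length}"); // 1024 for mxbai-embed-large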

Microsoft SemanticKernel: Vector Storage and Search

Microsoft SemanticKernel (specifically the Microsoft.Extensions.VectorData components) provides abstractions for working with vector databases. We're using the InMemoryVectorStore, which keeps everything in RAM for blazing-fast searches.

The library handles:

  • Vector storage with automatic indexing
  • Similarity search using cosine distance
  • Metadata management (file paths, descriptions)
  • Collection management and schema definition

For a production application, you could swap the in-memory store for persistent options like Azure AI Search, Qdrant, or Postgres with pgvector, all without changing your core code.

Prerequisites

Before we start coding, you'll need to set up the following:

1. Install Ollama

Download and install Ollama from ollama.ai. After installation, pull the embedding model:

ollama pull mxbai-embed-large

Verify it's running:

ollama list

2. Get a Gemini API Key

  1. Visit Google AI Studio
  2. Sign in with your Google account
  3. Click "Create API Key"
  4. Copy your API key (you'll need it in the code)

The free tier includes 1,500 requests per day, which is plenty for personal use.

3. Install NuGet Packages

Create a new C# console application and install these packages:

dotnet add package Google.GenAI
dotnet add package OllamaSharp
dotnet add package Microsoft.Extensions.AI
dotnet add package Microsoft.Extensions.VectorData
dotnet add package Microsoft.SemanticKernel.Connectors.InMemory

The Complete Implementation

Now let's walk through the code step by step.

Defining the Data Model

First, we define a GalleryItem class that represents each indexed image:

public class GalleryItem
{
    [VectorStoreKey]
    public int Id { get; set; }

    [VectorStoreData]
    public string FilePath { get; set; } = string.Empty;

    [VectorStoreData]
    public string Description { get; set; } = string.Empty;

    [VectorStoreVector(Dimensions: 1024, DistanceFunction = DistanceFunction.CosineSimilarity)]
    public ReadOnlyMemory<float> Vector { get; set; }
}

The attributes are crucial here:

  • [VectorStoreKey]: Marks Id as the unique identifier for each record
  • [VectorStoreData]: Indicates fields that store metadata (file path and description)
  • [VectorStoreVector]: Defines the vector field with its dimensionality (1024, matching our embedding model) and the distance function for similarity calculations

The DistanceFunction.CosineSimilarity is particularly important. Cosine similarity measures the angle between vectors rather than their absolute distance, making it ideal for semantic similarity where the direction (meaning) matters more than magnitude.
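
As a quick illustration of what the store computes under the hood, here is a tiny, self-contained sketch of cosine similarity; the real search does the same math across all 1024 dimensions:

// Cosine similarity = dot(a, b) / (|a| * |b|); values near 1 mean "same direction", i.e. similar meaning.
static float CosineSimilarity(ReadOnlySpan<float> a, ReadOnlySpan<float> b)
{
    float dot = 0, magA = 0, magB = 0;
    for (int i = 0; i < a.Length; i++)
    {
        dot += a[i] * b[i];
        magA += a[i] * a[i];
        magB += b[i] * b[i];
    }
    return dot / (MathF.Sqrt(magA) * MathF.Sqrt(magB));
}

// CosineSimilarity([1f, 0f], [0.9f, 0.1f]) ≈ 0.99  (nearly identical direction)
// CosineSimilarity([1f, 0f], [0f, 1f])     = 0     (unrelated)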

Initializing the Components

In our Main method, we set up the three key components:

static async Task Main(string[] args)
{
    Console.WriteLine("Starting Image Indexing with Gemini 2.5 API...");

    // 1. Setup Clients
    var client = new Client(apiKey: ApiKey);
    var embeddingGenerator = new OllamaApiClient(
        new Uri("http://localhost:11434/"), 
        "mxbai-embed-large"
    );

    // 2. Create the VectorStoreCollection
    var vectorStore = new InMemoryVectorStore();
    VectorStoreCollection<int, GalleryItem> collection = 
        vectorStore.GetCollection<int, GalleryItem>("gallery");
    await collection.EnsureCollectionExistsAsync();

    // 3. Index Local Images
    await IndexImagesAsync(client, embeddingGenerator, collection);

    // 4. Interactive Search
    await RunSearchLoop(embeddingGenerator, collection);
}

The workflow is straightforward:

  1. Create the Gemini API client
  2. Create the Ollama embedding client (pointing to your local Ollama instance)
  3. Set up an in-memory vector store and create a "gallery" collection
  4. Index all images in the specified folder
  5. Enter an interactive search loop

The Indexing Process: Where the Magic Happens

The IndexImagesAsync method is where we transform images into searchable vectors:

static async Task IndexImagesAsync(
    Client client, 
    IEmbeddingGenerator<string, Embedding<float>> generator, 
    VectorStoreCollection<int, GalleryItem> collection)
{
    var files = Directory.GetFiles(FolderPath, "*.*", SearchOption.AllDirectories)
        .Where(f => new[] { ".jpg", ".jpeg", ".png" }
            .Contains(Path.GetExtension(f).ToLower()))
        .ToList();

    int id = 0;
    foreach (var file in files)
    {
        try
        {
            Console.WriteLine($"Analyzing {Path.GetFileName(file)}...");
            var bytes = await File.ReadAllBytesAsync(file);

            var response = await client.Models.GenerateContentAsync(
                model: "gemini-2.5-flash",
                contents: new List<Content> {
                    new Content {
                        Parts = new List<Part> {
                            new Part { 
                                Text = "Describe this image in detail. Return JSON with 'description', 'people', 'clothing', and 'scenery'." 
                            },
                            new Part { 
                                InlineData = new Blob { 
                                    MimeType = "image/jpeg", 
                                    Data = bytes 
                                } 
                            }
                        }
                    }
                },
                config: new GenerateContentConfig { 
                    ResponseMimeType = "application/json" 
                }
            );

            string description = response.Candidates?[0].Content?.Parts?[0].Text ?? "{}";
            var vector = await generator.GenerateVectorAsync(description);

            await collection.UpsertAsync(new GalleryItem
            {
                Id = id++,
                FilePath = file,
                Description = description,
                Vector = vector
            });

            Console.WriteLine("Done");
        }
        catch (Exception ex) { 
            Console.WriteLine($"{ex.Message}"); 
        }
    }
}

Let's break down what's happening here:

Step 1: File Discovery
We recursively search the specified folder for all .jpg, .jpeg, and .png files. This could be extended to support other formats like .gif, .webp, or even .heic.

Step 2: Image Analysis with Gemini
For each image, we send it to Gemini with a carefully crafted prompt. Notice we're asking for JSON output with specific fields: description, people, clothing, and scenery. This structured approach makes the descriptions more consistent and comprehensive.

The ResponseMimeType = "application/json" configuration ensures Gemini returns valid JSON, which we can parse if needed (though for our purposes, we're using the raw JSON string as the embedding input).
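
If you do want the structured fields rather than the raw JSON string, a minimal parsing sketch with System.Text.Json could look like the following. The record shape is an assumption based on the prompt, and the model may return arrays rather than strings for some fields, so adjust the types to whatever you actually get back:

// Hypothetical shape mirroring the fields requested in the prompt.
public record ImageAnalysis(string? Description, string? People, string? Clothing, string? Scenery);

// Inside IndexImagesAsync, after receiving `description`:
var analysis = System.Text.Json.JsonSerializer.Deserialize<ImageAnalysis>(
    description,
    new System.Text.Json.JsonSerializerOptions { PropertyNameCaseInsensitive = true });

Console.WriteLine(analysis?.Description);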

Step 3: Generate Embeddings
The description text is sent to our local Ollama instance, which runs it through the mxbai-embed-large model and returns a 1024-dimensional vector. This happens entirely on your local machine.

Step 4: Store the Vector
We create a GalleryItem with the file path, description, and vector, then upsert it into our collection. The term "upsert" means it will insert if the ID is new or update if it already exists.

The Search Experience

Once indexing is complete, users can search using natural language:

static async Task RunSearchLoop(
    IEmbeddingGenerator<string, Embedding<float>> generator, 
    VectorStoreCollection<int, GalleryItem> collection)
{
    while (true)
    {
        Console.WriteLine("Describe the photo you want (or 'exit'):");
        var query = Console.ReadLine();
        if (string.IsNullOrEmpty(query) || query == "exit") break;

        var queryVector = await generator.GenerateVectorAsync(query);
        var options = new VectorSearchOptions<GalleryItem> { 
            IncludeVectors = false 
        };
        var searchResults = collection.SearchAsync(queryVector, 10, options);

        var hits = new List<GalleryItem>();
        int i = 1;

        Console.WriteLine("Matches:");
        await foreach (var result in searchResults)
        {
            if (result.Score > 0.6)
            {
                hits.Add(result.Record);
                Console.WriteLine($"{i}. {Path.GetFileName(result.Record.FilePath)} [Score: {result.Score:F2}]");
                i++;
            }
        }

        if (hits.Count == 0) { 
            Console.WriteLine("No matches found."); 
            continue; 
        }

        Console.WriteLine("\nSelect 1-10 to open, or 'n' for next search:");
        var choice = Console.ReadLine();
        if (int.TryParse(choice, out int sel) && sel >= 1 && sel <= hits.Count)
        {
            Process.Start(new ProcessStartInfo(hits[sel - 1].FilePath) { 
                UseShellExecute = true 
            });
        }
    }
}

The search process is elegant:

  1. Convert query to vector: User enters "woman in red dress at wedding" and we generate its embedding using the same Ollama model
  2. Vector similarity search: The vector store finds the top 10 most similar images based on cosine similarity
  3. Filter by threshold: We only show results with a similarity score above 0.6 (60%), filtering out weak matches
  4. Interactive selection: Users can choose which result to open, and we launch it in the default image viewer

The beauty of this approach is that the search understands semantic meaning. Searching for "beach sunset" will find images even if those exact words weren't in Gemini's description—it matches the concept.

Understanding the Similarity Score

The similarity score ranges from 0 to 1, where:

  • 0.9-1.0: Nearly identical meaning (very rare)
  • 0.7-0.9: Highly relevant match
  • 0.6-0.7: Good match, likely relevant
  • 0.4-0.6: Moderate match, may or may not be what you want
  • Below 0.4: Weak match, probably not relevant

In our implementation, we use 0.6 as the threshold. This is somewhat conservative—you might lower it to 0.5 for more permissive results or raise it to 0.7 for stricter matching. The optimal threshold often depends on your image collection and use case.
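
One small convenience, not in the original code, is letting the threshold be overridden at startup so you can experiment without recompiling:

// e.g. `dotnet run -- 0.5` to loosen the match threshold for this session.
double minScore = args.Length > 0 && double.TryParse(args[0], out var t) ? t : 0.6;
// ...then pass minScore into RunSearchLoop and use `result.Score > minScore`.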

Performance Considerations

Indexing Performance

Indexing is the most time-consuming part, with two primary bottlenecks:

  1. Gemini API calls: These are sequential and rate-limited. With the free tier allowing 60 requests per minute, indexing 1,000 images takes roughly 17 minutes.
  2. Network latency: Each API call includes uploading the image bytes and receiving the response.

Optimization strategies:

  • Batch processing: Process images in parallel, up to your rate limit (see the sketch after this list)
  • Caching: Store descriptions locally and only re-process if the file changes
  • Smart filtering: Skip images you know you don't need to search (screenshots, duplicates, etc.)
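
Here's a rough sketch of the batch-processing idea: throttle concurrent Gemini calls with a SemaphoreSlim sized to stay under your quota. The indexOne delegate stands in for the per-file body of IndexImagesAsync, and the degree of parallelism is an assumption you should tune to your tier:

// Index files concurrently, but never more than maxConcurrency at a time.
static async Task IndexInParallelAsync(
    IEnumerable<string> files,
    Func<string, Task> indexOne,   // the per-file work: analyze, embed, upsert
    int maxConcurrency = 4)
{
    using var throttle = new SemaphoreSlim(maxConcurrency);
    var tasks = files.Select(async file =>
    {
        await throttle.WaitAsync();
        try { await indexOne(file); }
        finally { throttle.Release(); }
    });
    await Task.WhenAll(tasks);
}

// Note: if you keep the incrementing `id` counter, make it thread-safe
// (e.g. Interlocked.Increment) before running the loop body in parallel.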

Search Performance

Search is remarkably fast:

  • Embedding generation: ~50-200ms locally with Ollama (depends on query length and hardware)
  • Vector search: Sub-millisecond for in-memory collections with thousands of images
  • Total search time: Typically under 300ms

For larger collections (100K+ images), consider:

  • Switching to a persistent vector database with indexed searches (Qdrant, Milvus, etc.)
  • Implementing approximate nearest neighbor (ANN) algorithms like HNSW
  • Pre-computing common query embeddings

Memory Usage

The in-memory vector store is simple but has obvious limitations. Each vector is 1024 floats × 4 bytes = 4KB. With metadata, each GalleryItem is roughly 5-6KB. So:

  • 1,000 images: ~6MB RAM
  • 10,000 images: ~60MB RAM
  • 100,000 images: ~600MB RAM

For most personal photo libraries, this is perfectly acceptable. If you're working with massive collections, the SemanticKernel abstraction makes it easy to swap in a persistent store.

Real-World Enhancements

Here are some practical improvements you might consider:

1. Persistent Storage

Replace the in-memory store with a persistent database:

// Instead of InMemoryVectorStore, use a real database
// For example, with Qdrant:
var vectorStore = new QdrantVectorStore(new QdrantClient("localhost", 6334));

This lets you:

  • Keep your index across application restarts
  • Handle collections too large for RAM
  • Deploy as a service that multiple clients can query
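
If you go the Qdrant route, you'll also need the matching connector package alongside the Qdrant client; the connector id below follows the same naming pattern as the in-memory one, so verify it against your SemanticKernel version:

dotnet add package Microsoft.SemanticKernel.Connectors.Qdrant
dotnet add package Qdrant.Client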

2. Incremental Indexing

Only index new or modified images:

var existingItems = await collection.GetAllAsync();
var existingPaths = existingItems.Select(i => i.FilePath).ToHashSet();

var newFiles = files.Where(f => !existingPaths.Contains(f) || 
    File.GetLastWriteTimeUtc(f) > GetLastIndexedTime(f));

3. Better Prompting

Experiment with different Gemini prompts for better descriptions:

var prompt = @"Analyze this image and provide a detailed description including:
- Main subjects and objects
- Actions and activities
- Colors and visual style
- Setting and environment
- Mood and atmosphere
- Text visible in the image

Format as JSON with fields: subjects, actions, colors, setting, mood, text.";

4. Multi-Modal Queries

Combine text and image queries:

// "Find images similar to this one, but with more people"
var referenceImageVector = await GetVectorForImage(referenceImagePath);
var textVector = await generator.GenerateVectorAsync("group of people");
var combinedVector = CombineVectors(referenceImageVector, textVector, weights: [0.7, 0.3]);
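
GetVectorForImage and CombineVectors are placeholders in the snippet above; one reasonable interpretation of CombineVectors is a weighted average of the two embeddings, renormalized so cosine similarity still behaves sensibly:

// Hypothetical helper: blend two embeddings with the given weights, then renormalize to unit length.
static ReadOnlyMemory<float> CombineVectors(ReadOnlyMemory<float> a, ReadOnlyMemory<float> b, double[] weights)
{
    var sa = a.Span;
    var sb = b.Span;
    var result = new float[sa.Length];
    for (int i = 0; i < result.Length; i++)
        result[i] = (float)(sa[i] * weights[0] + sb[i] * weights[1]);

    // Renormalize so the blended vector is comparable to the raw embeddings.
    float norm = MathF.Sqrt(result.Sum(x => x * x));
    for (int i = 0; i < result.Length; i++)
        result[i] /= norm;

    return result;
}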

5. Web Interface

Build a simple web UI using Blazor or ASP.NET Core:

app.MapPost("/search", async (SearchRequest req, VectorStoreCollection<int, GalleryItem> collection) =>
{
    var vector = await generator.GenerateVectorAsync(req.Query);
    var results = collection.SearchAsync(vector, req.Limit ?? 10);
    return Results.Ok(await results.ToListAsync());
});
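
SearchRequest and the captured generator aren't defined in this fragment; a minimal request type, with property names assumed, might be:

// Hypothetical request body for the /search endpoint.
public record SearchRequest(string Query, int? Limit);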

6. Metadata Extraction

Enhance indexing with EXIF data (GPS, timestamps, camera settings):

using MetadataExtractor;

var directories = ImageMetadataReader.ReadMetadata(file);
var gpsDirectory = directories.OfType<GpsDirectory>().FirstOrDefault();
var location = gpsDirectory?.GetGeoLocation();

This enables queries like "photos taken in Paris" or "pictures from my camera in 2023."
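
A simple way to make that searchable, assuming the MetadataExtractor types behave as shown, is to fold the extracted facts into the description text before generating the embedding (mapping coordinates to place names like "Paris" would additionally require a reverse-geocoding step):

// Append EXIF-derived facts to the Gemini description so they become part of the embedding.
var exifDirectory = directories.OfType<ExifSubIfdDirectory>().FirstOrDefault();
if (exifDirectory != null &&
    exifDirectory.TryGetDateTime(ExifDirectoryBase.TagDateTimeOriginal, out var taken))
{
    description += $" Taken on {taken:yyyy-MM-dd}.";
}
if (location != null)
{
    description += $" GPS coordinates: {location.Latitude:F4}, {location.Longitude:F4}.";
}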

7. Face Recognition

Integrate face detection/recognition to search for specific people:

// Using Azure Face API or a local library
var faces = await faceDetector.DetectFacesAsync(imageBytes);
description += $" Contains {faces.Count} people.";
if (faces.Any(f => f.MatchesKnownPerson("John")))
{
    description += " John is present.";
}

Cost Analysis

Let's talk about the economics of this solution:

Gemini API Costs

Free tier: 1,500 requests/day
Paid tier: $0.00001875 per image (as of 2024)

For a 10,000 image library:

  • Using free tier: 7 days to index
  • Using paid tier: ~$0.19 total, completed in hours

Ollama Costs

Completely free. Runs on your hardware. A mid-range GPU or modern CPU can generate embeddings in under 200ms.

SemanticKernel/Storage

Free for in-memory. Persistent vector databases have varying pricing:

  • Self-hosted (Qdrant, Milvus): Free, requires server
  • Azure AI Search: ~$75/month for basic tier
  • Pinecone: Free tier available, then usage-based

Alternative Approaches

This architecture is just one way to solve the problem. Here are alternatives worth considering:

Option 1: Fully Local with CLIP

Use OpenAI's CLIP model (available via Hugging Face) for both image and text embeddings:

Pros:

  • Completely local, no API calls
  • Zero cost after setup
  • Better privacy

Cons:

  • More complex setup
  • Requires better hardware (GPU recommended)
  • Slower indexing

Option 2: Cloud-Native with Azure AI Vision

Use Azure Computer Vision for image analysis and Azure AI Search for vector storage:

Pros:

  • Enterprise-grade reliability
  • Scales to millions of images
  • Advanced features (OCR, object detection, etc.)

Cons:

  • Higher costs
  • Data leaves your machine
  • Requires Azure subscription

Option 3: Hybrid with Multiple Models

Combine multiple AI models for different aspects:

Pros:

  • Best-of-breed for each task
  • Can optimize for accuracy vs. cost
  • Flexible architecture

Cons:

  • Increased complexity
  • Multiple dependencies
  • Harder to maintain

Conclusion

We've built a sophisticated image search engine that bridges the gap between how we remember photos and how computers organize them. By combining Gemini's vision capabilities, Ollama's local embeddings, and SemanticKernel's vector search, we've created a solution that's:

  • Powerful: Understands image content semantically
  • Fast: Sub-second search times
  • Private: Searches run entirely locally
  • Cost-effective: Free tier covers most personal use
  • Extensible: Easy to enhance and customize

The combination of cloud AI for heavy lifting (image understanding) and local AI for privacy-sensitive operations (search queries) represents a pragmatic approach to building AI applications. You get the benefits of large, powerful models without sacrificing control over your data.

What's Next?

Here are some ideas to take this project further:

  1. Build a desktop GUI using Avalonia or WPF for a more user-friendly interface
  2. Add filters for date ranges, file types, or custom categories
  3. Implement clustering to automatically group similar images
  4. Create collections that users can manually curate and search within
  5. Add tag suggestions based on content analysis
  6. Export functionality to create albums or share search results
  7. Mobile companion app to search your collection from your phone

The core architecture we've built is flexible enough to support all of these enhancements without major rewrites.

Final Thoughts

As AI capabilities become more accessible, the barrier to building sophisticated applications continues to fall. What would have required a team of ML engineers a few years ago can now be assembled in a weekend with the right combination of APIs and libraries.

The key is knowing which tools to combine. Gemini gives us world-class vision understanding. Ollama provides free, private, local inference. SemanticKernel offers production-ready abstractions for vector operations. Together, they create something greater than the sum of their parts.

I encourage you to experiment with this code, adapt it to your needs, and share what you build. The future of search isn't keyword matching—it's semantic understanding. And with the tools we've explored today, that future is already here.


Source code: The complete implementation is available as a working C# console application. Feel free to use it as a starting point for your own projects.

Questions or improvements? I'd love to hear how you've adapted this approach or what features you've added. The intersection of AI and practical applications is where the most exciting innovation happens.

Happy coding!

