Pod jednym z ostatnich moich wpisów wywiązała się ciekawa dyskusja o klasach std::set oraz std::unordered_set. W związku z tym postanowiłem, by w tym wpisie zbadać zapotrzebowanie pamięciowe powyższych dwóch klas.
Aby móc śledzić alokacje dokonywane przez kontener STL, napisałem allocator pamięci rr_debug_allocator. Klasa ta dziedziczy po std::allocator<T> i jedynym jej zadaniem jest wypisywanie na konsole zajmowanych i zwalnianych obszarów pamięci.

Jako punkt odniesienia przedstawiam zapotrzebowanie na pamięć dla kontenerów std::vector oraz std::list.

STL Custom Allocator

#pragma once

#include <cstdio>
#include <memory>

template < typename T >
class rr_debug_allocator: public std::allocator< T >
{
public:
    using size_type = size_t;
    using pointer = T*;
    using const_pointer = const T*;

    template< typename U >
    struct rebind { using other = rr_debug_allocator<U>; };

    pointer allocate(size_type n, const void *hint=0)
    {
        pointer result = std::allocator<T>::allocate(n, hint);
        const size_t alloc_size = n*sizeof(T);
        printf("  >>  [%d] bytes allocated at [%p] for [%d] x [%d]\n",alloc_size ,result,n,sizeof(T));
        return result;
    }

    void deallocate(pointer p, size_type n)
    {
        printf("  >>  [%d] bytes deallocated at [%p] for [%d] x [%d]\n", n*sizeof(T), p, n,sizeof(T));
        return std::allocator<T>::deallocate(p, n);
    }

    rr_debug_allocator() noexcept: std::allocator<T>() {
        //printf("  >>>>  rr_debug_allocator Constructor\n");
    }

    rr_debug_allocator(const rr_debug_allocator &a) noexcept: std::allocator<T>(a) { }

    template <class U>
    rr_debug_allocator(const rr_debug_allocator<U> &a) noexcept: std::allocator<T>(a) { }

    ~rr_debug_allocator() noexcept {
        //printf("  >>>>  ~rr_debug_allocator Destructor\n");
    }
};

Aplikacja testowa

#include "custom_alocator.h"

#include <vector>
#include <list>
#include <set>
#include <unordered_set>

using element_t     = int;
using allocator_t   = rr_debug_allocator< element_t >;

constexpr int ELEMENTS_COUNT = 4;

void test_vector()
{
    printf("test_vector\n");
    std::vector < element_t, allocator_t >  container;
    for( int i=0;i<ELEMENTS_COUNT; ++i)
        container.push_back(i);
}

void test_list()
{
    printf("test_list\n");
    std::list < element_t, allocator_t >  container;
    for( int i=0;i<ELEMENTS_COUNT; ++i)
        container.push_back(i);
}

void test_set()
{
    printf("test_set\n");
    std::set < element_t, std::less<element_t>, allocator_t >  container;
    for( int i=0;i<ELEMENTS_COUNT; ++i)
        container.insert(i);
}

void test_unordered_set()
{
    printf("test_unordered_set\n");
    std::unordered_set < element_t, std::hash<element_t>, std::equal_to<element_t>, allocator_t >  container;
    for( int i=0;i<ELEMENTS_COUNT; ++i)
        container.insert(i);
}

int main() {
    printf("sizeof(void*) = %d\n==============\n",sizeof(void*));
    test_vector();
    printf("\n");
    test_list();
    printf("\n");
    test_set();
    printf("\n");
    test_unordered_set();
    printf("\n");

    return 0;
}

Analiza otrzymanych wyników

Logi dla trybu 32 bit

sizeof(void*) = 4
==============
test_vector
  >>  [4] bytes allocated at [00fba190] for [1] x [4]
  >>  [8] bytes allocated at [00fba000] for [2] x [4]
  >>  [4] bytes deallocated at [00fba190] for [1] x [4]
  >>  [16] bytes allocated at [00fb6e38] for [4] x [4]
  >>  [8] bytes deallocated at [00fba000] for [2] x [4]
  >>  [16] bytes deallocated at [00fb6e38] for [4] x [4]
 
test_list
  >>  [12] bytes allocated at [00fb6e38] for [1] x [12]
  >>  [12] bytes allocated at [00fb0568] for [1] x [12]
  >>  [12] bytes allocated at [00fb0580] for [1] x [12]
  >>  [12] bytes allocated at [00fba648] for [1] x [12]
  >>  [12] bytes deallocated at [00fb6e38] for [1] x [12]
  >>  [12] bytes deallocated at [00fb0568] for [1] x [12]
  >>  [12] bytes deallocated at [00fb0580] for [1] x [12]
  >>  [12] bytes deallocated at [00fba648] for [1] x [12]
 
test_set
  >>  [20] bytes allocated at [00fb0568] for [1] x [20]
  >>  [20] bytes allocated at [00fb0588] for [1] x [20]
  >>  [20] bytes allocated at [00fba648] for [1] x [20]
  >>  [20] bytes allocated at [00fba668] for [1] x [20]
  >>  [20] bytes deallocated at [00fba668] for [1] x [20]
  >>  [20] bytes deallocated at [00fba648] for [1] x [20]
  >>  [20] bytes deallocated at [00fb0588] for [1] x [20]
  >>  [20] bytes deallocated at [00fb0568] for [1] x [20]
 
test_unordered_set
  >>  [8] bytes allocated at [00fba0c0] for [1] x [8]
  >>  [8] bytes allocated at [00fba0d0] for [2] x [4]
  >>  [8] bytes allocated at [00fba100] for [1] x [8]
  >>  [20] bytes allocated at [00fb0568] for [5] x [4]
  >>  [8] bytes deallocated at [00fba0d0] for [2] x [4]
  >>  [8] bytes allocated at [00fba000] for [1] x [8]
  >>  [8] bytes allocated at [00fba030] for [1] x [8]
  >>  [8] bytes deallocated at [00fba030] for [1] x [8]
  >>  [8] bytes deallocated at [00fba000] for [1] x [8]
  >>  [8] bytes deallocated at [00fba100] for [1] x [8]
  >>  [8] bytes deallocated at [00fba0c0] for [1] x [8]
  >>  [20] bytes deallocated at [00fb0568] for [5] x [4]

Logi dla trybu 64 bit

sizeof(void*) = 8
==============
test_vector
  >>  [4] bytes allocated at [0000000000b514e0] for [1] x [4]
  >>  [8] bytes allocated at [0000000000b51500] for [2] x [4]
  >>  [4] bytes deallocated at [0000000000b514e0] for [1] x [4]
  >>  [16] bytes allocated at [0000000000b514e0] for [4] x [4]
  >>  [8] bytes deallocated at [0000000000b51500] for [2] x [4]
  >>  [16] bytes deallocated at [0000000000b514e0] for [4] x [4]

test_list
  >>  [24] bytes allocated at [0000000000b514e0] for [1] x [24]
  >>  [24] bytes allocated at [0000000000b51500] for [1] x [24]
  >>  [24] bytes allocated at [0000000000b51520] for [1] x [24]
  >>  [24] bytes allocated at [0000000000b51540] for [1] x [24]
  >>  [24] bytes deallocated at [0000000000b514e0] for [1] x [24]
  >>  [24] bytes deallocated at [0000000000b51500] for [1] x [24]
  >>  [24] bytes deallocated at [0000000000b51520] for [1] x [24]
  >>  [24] bytes deallocated at [0000000000b51540] for [1] x [24]

test_set
  >>  [40] bytes allocated at [0000000000b514e0] for [1] x [40]
  >>  [40] bytes allocated at [0000000000b51510] for [1] x [40]
  >>  [40] bytes allocated at [0000000000b51540] for [1] x [40]
  >>  [40] bytes allocated at [0000000000b51570] for [1] x [40]
  >>  [40] bytes deallocated at [0000000000b51570] for [1] x [40]
  >>  [40] bytes deallocated at [0000000000b51540] for [1] x [40]
  >>  [40] bytes deallocated at [0000000000b51510] for [1] x [40]
  >>  [40] bytes deallocated at [0000000000b514e0] for [1] x [40]

test_unordered_set
  >>  [16] bytes allocated at [0000000000b514e0] for [1] x [16]
  >>  [16] bytes allocated at [0000000000b51500] for [2] x [8]
  >>  [16] bytes allocated at [0000000000b51520] for [1] x [16]
  >>  [40] bytes allocated at [0000000000b51540] for [5] x [8]
  >>  [16] bytes deallocated at [0000000000b51500] for [2] x [8]
  >>  [16] bytes allocated at [0000000000b51500] for [1] x [16]
  >>  [16] bytes allocated at [0000000000b51570] for [1] x [16]
  >>  [16] bytes deallocated at [0000000000b51570] for [1] x [16]
  >>  [16] bytes deallocated at [0000000000b51500] for [1] x [16]
  >>  [16] bytes deallocated at [0000000000b51520] for [1] x [16]
  >>  [16] bytes deallocated at [0000000000b514e0] for [1] x [16]
  >>  [40] bytes deallocated at [0000000000b51540] for [5] x [8]

Podsumowanie

Przeprowadzony eksperyment spełniał następujące warunki:

  • mamy N elementów do wstawiania.
  • kontenery były tworzone z domyślnymi ustawieniami.
  • kompilator MinGW-w64 (w64 4.0).

std::vector

Element w kontenerze zajmuje dokładnie tyle samo pamięci co element poza kontenerem. W najgorszym przypadku całkowita pamięć zaalokowana przez kontener będzie równa M = \frac{3}{2}N * sizeof(T)

std::list

Element w kontenerze potrzebuje dwóch wskaźników na kolejny i poprzedni węzeł listy. M = 2N * sizeof(void\star) + N * sizeof(T)

std::set

Dla każdego elementu kontener tworzy nowy węzeł drzewa. Każdy taki węzeł potrzebuje przechować dwa wskaźniki, przekazaną wartość oraz pewne wartości zależne od implementacji (głębokość obydwu pod-drzew potrzebna do balansowania drzewem) M = 4N * sizeof(void\star) + N * sizeof(T)

std::unordered_set

Wewnętrzna implementacja kontenera std::unordered_set, w mojej wersji gcc, to połączenie klasy vector i list. Każdy element to węzeł listy jednokierunkowej, z tą jednak różnicą, że zamiast jednej listy mamy pewną ilość list K. Wartość K jest zależna od ilości elementów w kontenerze, podobnie jak ma to miejsce w klasie vector. Wraz z dodawaniem nowych elementów rozmiar K jest zwiększany. Przybliżoną wartość dla najgorszego przypadku możemy policzyć następująco. M \approx N*sizeof(T) + 3N*sizeof(void\star)