This article shows you how to optimize memory allocations in C++ using placement new. Applicable especially to usecases dealing with creation of large number of objects.
A brief on problem:
Lets take example problem for simplicity:
- I have a GSM network with a cell in it.
- Cell's coverage area is divided into a pixel grid with dimensions 2000x2000.
- at each pixel the signal strength is measured.
Requirement is to find out the % of pixels in a cell that have below par signal strength.
Solution 1 using the good old C:
In C, memory is dealt in raw. You want memory, allocate the number of bytes you want. No guarantees are give regarding the contents of that memory (using malloc()). At best you can initialize the whole block of memory you allocated with "zeros" (calloc() or memset()), but not to a value specific to your application (not unless you REALLY go deep into how your specific compiler creates the memory map for your data).
// Solution 1
#include <stdio.h>
#include <stdlib.h>
#define GRID_WIDTH 200
#define GRID_HEIGHT 200
#define MIN_SIGNAL_STRENGTH 3
typedef struct pixelStruct {
int _x;
int _y;
int _signal_strength;
} t_pixel;
void print_pixel( const char* msg, const t_pixel* ptr ) {
cout << msg << endl << "\tptr = 0x" << hex << ptr << ", x = " << ptr->_x << ", y = " << ptr->_y << ", signal_strength = " << ptr->_signal_strength << endl;
}
int main() {
t_pixel* pixelArray[GRID_WIDTH][GRID_HEIGHT];
srand ( 21346 );
int w, h;
// create the pixels..
for ( w = 0; w < GRID_WIDTH; w++ )
for ( h = 0; h < GRID_HEIGHT; h++ )
pixelArray[w][h] = malloc(sizeof(t_pixel));
// see what the memory looks like just after allocation.
print_pixel("after allocation, before initialization", pixelArray[0][0]);
// initialize the pixels..
for ( w = 0; w < GRID_WIDTH; w++ ) {
for ( h = 0; h < GRID_HEIGHT; h++ ) {
t_pixel* ptr = pixelArray[w][h];
ptr->_signal_strength = rand()%10;
ptr->_x = w;
ptr->_y = h;
}
}
// see what the memory looks like after initialization.
print_pixel("after allocation AND initialization", pixelArray[0][0]);
// calculate the % of pixels with below par signal strength
int numPixelsWithLowSignal = 0;
for ( w = 0; w < GRID_WIDTH; w++ ) {
for ( h = 0; h < GRID_HEIGHT; h++ ) {
if ( pixelArray[w][h]->_signal_strength < MIN_SIGNAL_STRENGTH )
++numPixelsWithLowSignal;
}
}
cout << (numPixelsWithLowSignal * 100 / (GRID_WIDTH*GRID_HEIGHT)) << "% of pixels have below par signal strength." << endl;
// free up the memory
for ( w = 0; w < GRID_WIDTH; w++ )
for ( h = 0; h < GRID_HEIGHT; h++ )
free(pixelArray[w][h]);
return 0;
}
/*
// OUTPUT:
after allocation, before initialization
ptr = 0x3e3f48, x = 4063800, y = 4063800, signal_strength = 0
after allocation and initialization
ptr = 0x3e3f48, x = 0, y = 0, signal_strength = 9
29% of pixels have below par signal strength.
*/
Some basics that we can take away from this code are:
1. malloc() only allocates memory. It knows NOTHING about how the memory would be actually used and thus returns a void*. We are using it as pixel* in our case. So each call to malloc(sizeof(pixel)) just marks 12 bytes## (4 bytes for each int member variable) as allocated. It can-do/does NOTHING to contents of the memory it marks.
2. In second loop we initialize each pixel and give it the correct coordinate and a random signal strength.
As the output of the code shows, x, y and signal have some random values before they are initialized in the second loop.
So to summarize: Creation of each pixel can be divided into two parts:
1. memory allocation: pixel* ptr = malloc(sizeof(pixel)); // memory allocation.
2. initialization: ptr->_signal_strength = rand()%10; // initialization
ptr->_x = w; // initialization
ptr->_y = h; // initialization
Solution 2 using C++ new operator()
C++ brings to us two new concepts (compared to C) relevant to our problem.
1. One of the basic improvements we had in C++ classes vs. C structs in the the the concept of constructors**. So that you can initialize the contents of the allocated memory in a more organized manner.
2. operator new()
(instead of malloc).
Simple C++ solution doing the same as previous C code would look like this:
// Solution 2
#include <Common.h> // attached to article
#define GRID_WIDTH 2000
#define GRID_HEIGHT 2000
#define MIN_SIGNAL_STRENGTH 3
class pixel {
public:
pixel(int x, int y, int signal_strength) {
_signal_strength = signal_strength;
_x = x;
_y = y;
}
// C'tor above can also be written using c'tor inializers
// Which actually is faster! Syntax:
// pixel(int x, int y, int signal_strength) : _signal_strength (signal_strength), _x(x), _y(y)
// { }
int _signal_strength ;
int _x;
int _y;
void print_pixel(const char* msg) const {
cout << msg << endl << "\tptr = " << hex << this << ", x = " << _x
<< ", y = " << _y << ", signal_strength = " << _signal_strength
<< endl;
}
};
//main program to call the array for 4 ints and return average
int main() {
startClock();
srand ( 213462434L );
vector<pixel*> pixelVec ;
// create AND initialize pixels..
for ( size_t w = 0; w < GRID_WIDTH; w++ )
for ( size_t h = 0; h < GRID_HEIGHT; h++ )
pixelVec.push_back( new pixel(w, h, rand()%10) );
printClock("Time for default new operator(): ");
// see what the memory looks like..
pixelVec[0]->print_pixel("after allocation AND initialization");
// calculate the % of pixels with below par signal strength
int numPixelsWithLowSignal = 0;
for ( size_t i = 0; i < pixelVec.size(); i++ )
if ( pixelVec[i]->_signal_strength < MIN_SIGNAL_STRENGTH )
++numPixelsWithLowSignal;
cout << dec << (numPixelsWithLowSignal * 100 / (GRID_WIDTH*GRID_HEIGHT)) << "% of pixels have below par signal strength." << endl;
// cleanup..
for ( size_t i = 0; i < pixelVec.size(); i++ )
delete pixelVec[i];
pixelVec.clear();
printClock("Total time including delete: ");
return 0 ;
}
/*
OUTPUT:
Time for default new operator(): 9937487 micro seconds
after allocation AND initialization
ptr = 0x3e3f10, x = 0, y = 0, signal_strength = 4
29% of pixels have below par signal strength.
Total time including delete: 20218721 micro seconds
*/
As we can see the basic difference between Solution 1 & 2 is that we use operator new, which does both memory allocation and initialization (by making a call to the constructor of the class pixel).
In other words this: new pixel(w, h, rand()%10)
is same as: pixel* ptr = malloc(sizeof(pixel)); // allocate
ptr->pixel(w, h, rand()%10); // call constructor to initialize
which in turn is same as: pixel* ptr = malloc(sizeof(pixel));
ptr->_x = w;
ptr->_y = h;
ptr->_signal_strength = rand()%10;
Solution 3 using C++ placement new operator()
As we learned operator new does 2 things 1) memory allocation 2) initialization via a call to constructor.
C++ provides an overloaded version of new operator called placement new, which leaves job of the memory allocation to user. What this means is that if user knows a better / faster way to allocate memory, let him do so and then call placement new operator to perform the initialization.
// Solution 3 using placement new operator.
#include <Common.h> // attached to article
#define GRID_WIDTH 2000
#define GRID_HEIGHT 2000
#define MIN_SIGNAL_STRENGTH 3
class pixel {
public:
pixel(int x, int y, int signal_strength) {
_signal_strength = signal_strength;
_x = x;
_y = y;
}
int _signal_strength ;
int _x;
int _y;
void print_pixel(const char* msg) const {
cout << msg << endl << "\tptr = " << hex << this << ", x = " << _x
<< ", y = " << _y << ", signal_strength = " << _signal_strength
<< endl;
}
};
//main program to call the array for 4 ints and return average
int main() {
startClock();
void* pre_allocated_memory = malloc(GRID_WIDTH * GRID_HEIGHT * sizeof(pixel));
srand ( 213462434L );
vector<pixel*> pixelVec ;
// create AND initialize pixels..
for ( size_t w = 0; w < GRID_WIDTH; w++ )
for ( size_t h = 0; h < GRID_HEIGHT; h++ ) {
// pixelVec.push_back( new pixel(w, h, rand()%10) );
pixelVec.push_back( new (pre_allocated_memory) pixel(w, h, rand()%10) );
// as we've used the address pointed by pre_allocated_memory
// make pre_allocated_memory point to next slot
pre_allocated_memory = (pixel*) pre_allocated_memory + 1;
}
printClock("Time for placement new operator(): ");
// see what the memory looks like..
pixelVec[0]->print_pixel("after allocation AND initialization");
// calculate the % of pixels with below par signal strength
int numPixelsWithLowSignal = 0;
for ( size_t i = 0; i < pixelVec.size(); i++ )
if ( pixelVec[i]->_signal_strength < MIN_SIGNAL_STRENGTH )
++numPixelsWithLowSignal;
cout << dec << (numPixelsWithLowSignal * 100 / (GRID_WIDTH*GRID_HEIGHT)) << "% of pixels have below par signal strength." << endl;
// cleanup..
pixelVec.clear();
free(pre_allocated_memory);
printClock("Total time including delete: ");
return 0 ;
}
/*
Time for placement new operator(): 218748 micro seconds
after allocation AND initialization
ptr = 0x4f0020, x = 0, y = 0, signal_strength = 4
29% of pixels have below par signal strength.
Total time including delete: 484370 micro seconds
*/
As described above, here we know that I would need to create GRID_WIDTH * GRID_HEIGHT objects of type pixel. So instead of calling the new operator GRID_WIDTH * GRID_HEIGHT times to perform the memory allocation, we do it once using the good old malloc(). And now we call the placement new instead which uses required number of bytes inside the pre-allocated memory and then calls the constructor of pixel to do initialization.
Do note this pre_allocated_memory = (pixel*) pre_allocated_memory + 1;
. This statement moves the pointer forward to ensure that next pixel does not use the same memory.
By using placement new, we replace:
- GRID_WIDTH * GRID_HEIGHT
number of memory allocations of sizeof(pixel)
bytes each
with
- one memory allocations of GRID_WIDTH * GRID_HEIGHT * sizeof(pixel)
bytes
This is what gives us the performance boost*#.
In the given example we get 45 times (or 4500%) of improvement. 218748 micro seconds for placement new VS. 9937487 micro seconds default new.
NOTES:
*# Amount of performance improvement would depend on the complexity of constructor code as well. In this example each call to constructor initializes 3 int variables. If the constructor were more complex the % benefit could reduce. But you can expect a big boost in most cases.
## Since there is no standard size for in int defined by the standard this is not necessarily true. It also ignores the fact that there could be data alignment padding in the struct that could augment the size.
** C++ structs can also have constructors unlike C structs.
---------------------------------------------------------------------------------------
A word on freeing memory allocated using placement new operator delete()
does exactly the opposite of what operator new()
does. It:
- un-intializes / destroys the object: by calling destructor.
- frees the memory that was used by teh object.
Unlike the operator new()
, which has a corresponding operator delete(), the placement new operator does not have a corresponding placement operator delete().
So when using placement new, we must ourselves do these 2 things explicitly.
In the example taken in this example the object in question (pixel) is quite trivial and has no need for user defined destruction. In cases where the object being dealt with requires non-trivial destruction, user must explicitly invoke the destructor. E.g. lets say the each pixel instance held onto some system resource (say a file descriptor) then to release such resource, destructor must be invoked in addition to freeing the buffer (pre_allocated_memory).
An example would look like this:
// cleanup
#include <Common.h> // attached to article
#define GRID_WIDTH 2000
#define GRID_HEIGHT 2000
#define MIN_SIGNAL_STRENGTH 3
class pixel {
ifstream* pFileStream;
public:
pixel(int x, int y, int signal_strength) {
_signal_strength = signal_strength;
_x = x;
_y = y;
pFileStream = new ifstream("somefile");
}
~pixel() {
if(NULL != pFileStream) {
pFileStream->close();
pFileStream = NULL;
}
}
void print_pixel(const char* msg) const {
cout << msg << endl << "\tptr = " << hex << this << ", x = " << _x
<< ", y = " << _y << ", signal_strength = " << _signal_strength
<< endl;
}
public:
int _signal_strength ;
int _x;
int _y;
};
//main program to call the array for 4 ints and return average
int main() {
startClock();
void* pre_allocated_memory = malloc(GRID_WIDTH * GRID_HEIGHT * sizeof(pixel));
srand ( 213462434L );
vector<pixel*> pixelVec ;
// create AND initialize pixels..
for ( size_t w = 0; w < GRID_WIDTH; w++ )
for ( size_t h = 0; h < GRID_HEIGHT; h++ ) {
// pixelVec.push_back( new pixel(w, h, rand()%10) );
pixelVec.push_back( new (pre_allocated_memory) pixel(w, h, rand()%10) );
// as we've used the address pointed by pre_allocated_memory
// make pre_allocated_memory point to next slot
pre_allocated_memory = (pixel*) pre_allocated_memory + 1;
}
printClock("Time for placement new operator(): ");
// see what the memory looks like..
pixelVec[0]->print_pixel("after allocation AND initialization");
// calculate the % of pixels with below par signal strength
int numPixelsWithLowSignal = 0;
for ( size_t i = 0; i < pixelVec.size(); i++ )
if ( pixelVec[i]->_signal_strength < MIN_SIGNAL_STRENGTH )
++numPixelsWithLowSignal;
cout << dec << (numPixelsWithLowSignal * 100 / (GRID_WIDTH*GRID_HEIGHT)) << "% of pixels have below par signal strength." << endl;
// cleanup..
// for non-trivial destruction, call the destructor explicitly.
for ( size_t i = 0; i < pixelVec.size(); i++ )
pixelVec[i]->~pixel();
pixelVec.clear();
free(pre_allocated_memory);
printClock("Total time including delete: ");
return 0 ;
}
A word padding and alignment
Placement new does not take any special care of padding or data structure alignment.
Given that sizeof() operator considers the padding while computing the size of given type, it is safe to assume that:
// this loop...
for ( size_t h = 0; h < NUM_OBJECTS; h++ ) {
new (pre_allocated_memory) my_type();
pre_allocated_memory = (my_type*) pre_allocated_memory + 1;
}
// ...will NOT exhaust the pre_allocated_memory as long as
// pre_allocated_memory is allocated using sizeof(). e.g.
void* pre_allocated_memory = malloc(NUM_OBJECTS * sizeof(my_type));
In other words, do not use magic numbers that you comeup with as size of your type. Trust that job to sizeof(). In the example as pixel contains 3 int variables, you could say malloc(NUM_PIXELS * 12);
but DON'T.
Also see this for more info on data alignment and portability.