Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SINGA-236 memory pool (update from an old branch) #254

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
13 changes: 10 additions & 3 deletions include/singa/core/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@


#ifdef USE_OPENCL
#include "singa/utils/opencl_utils.h"
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#include <CL/cl2.hpp>
#include <unordered_map>
#endif // USE_OPENCL

using std::atomic;
Expand All @@ -62,6 +65,9 @@ class Block {
// Disabled as it is not used currently.
// Block(void* ptr, size_t size, size_t offset, std::shared_ptr<atomic<int>>
// ref) : data_(ptr), size_(size), offset_(offset), ref_count_(ref) {}

// TODO(wangwei) check if the set is correct and add lock if shared structure is allowed
// Replaces the raw data pointer held by this block with `ptr`.
// Only the pointer is overwritten; whatever the previous pointer referred to
// is not released here.
void set_data(void* ptr) { data_ = ptr; }
void* mutable_data() {
initialized_ = true;
return static_cast<char*>(data_) + offset_;
Expand Down Expand Up @@ -107,8 +113,9 @@ typedef struct _Context {
#endif // USE_CUDA

#ifdef USE_OPENCL
// This stores the context ID of the OpenCL context controlled by ViennaCL.
long vcl_ctx_id;
std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels;
cl::CommandQueue ocl_cmdq;
cl::Context ocl_ctx;
#endif

} Context;
Expand Down
52 changes: 52 additions & 0 deletions include/singa/core/memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <atomic>
#include "singa/proto/core.pb.h"
#include "singa/singa_config.h"
#include "singa/core/common.h"

#ifdef USE_CUDA
#include "cnmem.h"
Expand Down Expand Up @@ -50,6 +51,57 @@ class DeviceMemPool {
// size_t init_size_ = 0, max_size_ = 0;
};

class CppMemPool {
public:
// Creates the pool.
// init_size_mb: initial size of the whole pool, in MB.
// uint_size_kb: size of each memory unit ("uint") of the pool, in KB.
CppMemPool(size_t init_size_mb = 256, size_t uint_size_kb = 1);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are all memory units of the same size?
Can we do it in this way:

  1. create a large memory pool P0 at the beginning
  2. when malloc is called, return a block of the asked size and move the offset of the start pointer of the pool Pi
  3. if there is not enough space to malloc a new block, then create a new pool Pi+1.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

optimizations could be added later, e.g.
OptimizeMemPool(): re-compute the total size blocks in use, and move them into contiguous memory space.


// Re-initializes this pool in place (nothing is returned) with a pool size of
// init_size_mb (MB) and a memory unit size of uint_size_kb (KB).
// If no unit of the pool is in use, new storage is allocated and split into
// free units. Otherwise a new pool is built, the data of every block living
// in the current pool is copied into it and the blocks are re-pointed to the
// copies; the state of the new pool is then swapped into this object and the
// old storage is released, so raw data pointers obtained before the call
// must not be used afterwards.
void RsetMemPool(size_t init_size_mb = 256, size_t uint_size_kb = 1);

// Allocates `size` bytes and attaches them to a Block.
// The memory is taken from a free unit of the pool, unless `size` is larger
// than the payload capacity of a unit (unit size minus its meta data) or no
// free unit is left; in those cases it is obtained with a system malloc.
// is_ptr_null indicates whether *ptr is null: if so, a new Block is created
// and stored in *ptr; otherwise the existing Block *ptr is reused (its size
// must equal `size`) and only its data pointer is replaced, so the original
// data pointer of that block is lost.
void Malloc(Block** ptr, const size_t size, bool is_ptr_null = true);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would it be simpler if the API is
Block* Malloc(const size_t size)

// Releases the memory held by `ptr`: a unit inside the pool goes back to the
// free list, memory outside the pool is released with a system free.
// The Block object itself is deleted as well.
void Free(Block* ptr);

// Returns (free bytes, total bytes) of the pool, counted in whole units.
// Memory allocated outside the pool is not included.
std::pair<size_t, size_t> GetMemUsage();
// Number of units of the pool that are currently not handed out.
size_t GetNumFreeUints() {
  return numUints - numAllocatedUintsInPool;
}

// Releases the memory of the pool.
// All blocks allocated through the pool must be freed before the destructor
// is called; the destructor checks that no allocation is outstanding.
~CppMemPool();

protected:
// Header placed at the start of every memory unit ("uint") of the pool.
// Units are chained into doubly linked lists through pPrev / pNext; the
// payload handed out to a block starts right after this header.
struct _Uint {
  struct _Uint* pPrev;  // previous unit in the list, NULL for the list head
  struct _Uint* pNext;  // next unit in the list, NULL for the list tail
  Block* pBlk;          // block occupying this unit, NULL while the unit is free
};

// start of the contiguous storage backing the pool (obtained with malloc)
void* pMemPool;

// head of the linked list of units that are currently allocated
struct _Uint* pAllocatedMemUint;
// head of the linked list of units that are currently free
struct _Uint* pFreeMemUint;

// memUintSize: size in bytes of each unit including its _Uint header;
// memUintSizeNoMeta: bytes left for the payload (memUintSize - sizeof(_Uint))
size_t memUintSize, memUintSizeNoMeta;

// the number of memory units in the pool
size_t numUints;
// the number of allocated units that reside inside the memory pool
size_t numAllocatedUintsInPool;
// the number of live allocations made through Malloc, including the ones
// that reside outside the memory pool
size_t numAllocatedUints;
};

#ifdef USE_CUDA
class CnMemPool : public DeviceMemPool {
public:
Expand Down
153 changes: 151 additions & 2 deletions src/core/memory/memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,157 @@
#include "singa/proto/core.pb.h"
#include <iostream>

#ifdef USE_CUDA
namespace singa {

// Reports the pool usage as (free bytes, total bytes). Both figures are
// counted in whole units; allocations served outside the pool are ignored.
std::pair<size_t, size_t> CppMemPool::GetMemUsage() {
  const size_t totalBytes = memUintSize * numUints;
  const size_t usedBytes = memUintSize * numAllocatedUintsInPool;
  return std::make_pair(totalBytes - usedBytes, totalBytes);
}

// Builds an empty pool object and then lets RsetMemPool allocate the storage:
// init_size_mb MB in total, split into units of uint_size_kb KB.
CppMemPool::CppMemPool(size_t init_size_mb, size_t uint_size_kb)
    : pMemPool(NULL),
      pAllocatedMemUint(NULL),
      pFreeMemUint(NULL),
      memUintSize(0),
      memUintSizeNoMeta(0),
      numUints(0),
      numAllocatedUintsInPool(0),
      numAllocatedUints(0) {
  // All bookkeeping is zeroed above, so RsetMemPool takes its
  // "pool is empty" path and performs the actual initialization.
  RsetMemPool(init_size_mb, uint_size_kb);
}


void CppMemPool::RsetMemPool(size_t init_size_mb, size_t uint_size_kb) {

if(numAllocatedUintsInPool == 0) { // in the case the pool is empty
// setting up the parameters in the memory pool
const size_t kNBytesPerKB = (1u << 10);
const size_t kNBytesPerMB = (1u << 20);
memUintSize = uint_size_kb * kNBytesPerKB;
memUintSizeNoMeta = memUintSize - sizeof(struct _Uint);
size_t poolSize = init_size_mb * kNBytesPerMB;
bool memAligned = poolSize % memUintSize == 0;
numUints = memAligned ? (poolSize / memUintSize) : (poolSize / memUintSize + 1);
CHECK_GE(numUints,1);
poolSize = memUintSize * numUints;

// intialize the memory pool
pMemPool = malloc(poolSize);
CHECK(pMemPool != NULL);
for(size_t idx = 0; idx < numUints; idx++) {
struct _Uint *pCurUint = (struct _Uint*)((char *)pMemPool + idx * memUintSize);
pCurUint->pPrev = NULL;
pCurUint->pNext = pFreeMemUint;
if(pFreeMemUint != NULL) {
pFreeMemUint->pPrev = pCurUint;
}
pFreeMemUint = pCurUint;
pCurUint->pBlk = NULL;
}
} else { // the pool is not empty, create a new one and copy the old to the new one
CppMemPool* pNewPool = new CppMemPool(init_size_mb, uint_size_kb);
struct _Uint* pCurUint = pAllocatedMemUint;
for(size_t idx = 0; idx < numAllocatedUintsInPool; idx++) {
Block* pOldBlk = pCurUint->pBlk;
void* pData = pOldBlk->mutable_data();
pNewPool->Malloc(&pOldBlk, pOldBlk->size(), false);
size_t copySize = pOldBlk->size() - pOldBlk->offset();
memcpy(pOldBlk->mutable_data(),pData,copySize);
pCurUint = pCurUint->pNext;
}
// swap the new pool with the current
std::swap(pNewPool->pMemPool,pMemPool);
std::swap(pNewPool->pAllocatedMemUint,pAllocatedMemUint);
std::swap(pNewPool->pFreeMemUint,pFreeMemUint);
std::swap(pNewPool->memUintSize,memUintSize);
std::swap(pNewPool->memUintSizeNoMeta,memUintSizeNoMeta);
std::swap(pNewPool->numUints,numUints);
std::swap(pNewPool->numAllocatedUintsInPool,numAllocatedUintsInPool);
pNewPool->numAllocatedUints = 0;
delete pNewPool;
}
}

// Allocates `size` bytes and binds them to a Block.
//
// ptr          must not be null. On return *ptr is the block owning the
//              memory.
// size         number of bytes requested.
// is_ptr_null  true: a new Block is created and stored in *ptr.
//              false: the existing Block *ptr (which must not be null and
//              must have size() == size) is reused; only its data pointer is
//              replaced, so the data pointer it held before is lost.
//
// The memory is taken from a free unit of the pool. When `size` exceeds the
// payload capacity of a unit, or no free unit is left, it is obtained with a
// system malloc instead.
void CppMemPool::Malloc(Block** ptr, const size_t size, bool is_ptr_null) {
  CHECK(ptr != NULL);
  CHECK(is_ptr_null || *ptr != NULL);

  // Binds the payload address to the block, creating the block if requested.
  // Shared by the in-pool and the out-of-pool path.
  auto bindData = [&](void* pData) {
    if (is_ptr_null) {
      *ptr = new Block(pData, size);
    } else {
      CHECK_EQ((*ptr)->size(), size);
      (*ptr)->set_data(pData);
    }
  };

  numAllocatedUints++;

  // too large for a unit, or the pool is exhausted: fall back to the system
  if (size > memUintSizeNoMeta || pFreeMemUint == NULL) {
    void* pData = malloc(size);
    // malloc(0) is allowed to return NULL, anything else is a failure
    CHECK(pData != NULL || size == 0);
    bindData(pData);
    return;
  }

  // otherwise take the head of the free list ...
  numAllocatedUintsInPool++;
  struct _Uint* pCurUint = pFreeMemUint;
  CHECK(pCurUint->pBlk == NULL);  // a free unit must not belong to a block
  pFreeMemUint = pCurUint->pNext;
  if (pFreeMemUint != NULL) {
    pFreeMemUint->pPrev = NULL;
  }

  // ... and make it the new head of the allocated list
  pCurUint->pPrev = NULL;
  pCurUint->pNext = pAllocatedMemUint;
  if (pAllocatedMemUint != NULL) {
    pAllocatedMemUint->pPrev = pCurUint;
  }
  pAllocatedMemUint = pCurUint;

  // the payload starts right behind the unit header
  void* pData = (void*)((char*)pCurUint + sizeof(struct _Uint));
  bindData(pData);
  pCurUint->pBlk = *ptr;
}

// Releases the memory held by `ptr` and deletes the Block object.
//
// If the memory lies inside the pool, its unit is unlinked from the allocated
// list and pushed onto the free list; otherwise the memory was obtained with
// a system malloc and is released with free(). Passing a null pointer is a
// no-op.
void CppMemPool::Free(Block* ptr) {
  if (ptr == NULL) {
    return;
  }

  // mutable_data() yields the base pointer advanced by the block's offset.
  // The address handed out by Malloc is the base pointer, so the offset has
  // to be undone before locating the unit header or calling free().
  char* pData = static_cast<char*>(ptr->mutable_data()) - ptr->offset();
  char* pPoolBegin = static_cast<char*>(pMemPool);
  char* pPoolEnd = pPoolBegin + numUints * memUintSize;

  if (pPoolBegin < pData && pData < pPoolEnd) {
    // the unit header sits directly in front of the payload
    struct _Uint* pCurUint = (struct _Uint*)(pData - sizeof(struct _Uint));
    CHECK(ptr == pCurUint->pBlk);

    // unlink the unit from the allocated list
    if (pCurUint == pAllocatedMemUint) {
      pAllocatedMemUint = pCurUint->pNext;
      if (pAllocatedMemUint != NULL) {
        pAllocatedMemUint->pPrev = NULL;
      }
    } else {
      struct _Uint* pCurPrevUint = pCurUint->pPrev;
      // every unit other than the head must have a predecessor
      CHECK(pCurPrevUint != NULL);
      pCurUint->pPrev = NULL;
      pCurPrevUint->pNext = pCurUint->pNext;
      if (pCurUint->pNext != NULL) {
        pCurUint->pNext->pPrev = pCurPrevUint;
      }
    }

    // push the unit onto the free list
    pCurUint->pNext = pFreeMemUint;
    if (pFreeMemUint != NULL) {
      pFreeMemUint->pPrev = pCurUint;
    }
    pFreeMemUint = pCurUint;
    pCurUint->pBlk = NULL;
    numAllocatedUintsInPool--;
  } else {
    // allocated outside the pool by Malloc
    free(pData);
  }
  numAllocatedUints--;
  delete ptr;
}

// Releases the storage of the pool.
CppMemPool::~CppMemPool() {
// every allocation made through Malloc must have been returned through Free
CHECK_EQ(numAllocatedUints,0);
// free(NULL) is a no-op, so this is safe for a pool without storage
free(pMemPool);
}


#ifdef USE_CUDA
std::atomic<int> CnMemPool::pool_count(0);
std::pair<size_t, size_t> CnMemPool::GetMemUsage() {
size_t free, total;
Expand Down Expand Up @@ -107,5 +256,5 @@ void CudaMemPool::Free(void *ptr) {
cudaError_t status = cudaFree(ptr);
CHECK_EQ(status, cudaError_t::cudaSuccess);
}
}
#endif
}
Loading