// namespaces namespace xf::graph::internal namespace xf::graph::internal::bfs namespace xf::graph::internal::calc_degree namespace xf::graph::internal::connected_components namespace xf::graph::internal::convert_csr_csc namespace xf::graph::internal::diameter namespace xf::graph::internal::hash_group_aggregate namespace xf::graph::internal::label_propagation namespace xf::graph::internal::mis namespace xf::graph::internal::mssp namespace xf::graph::internal::mst namespace xf::graph::internal::pagerank namespace xf::graph::internal::pagerankMultiChannel namespace xf::graph::internal::scc namespace xf::graph::internal::sssp namespace xf::graph::internal::sssp::nopred namespace xf::graph::internal::sssp::pred namespace xf::graph::internal::triangle_count namespace xf::graph::merge
bfs
#include "bfs.hpp"
template <int MAXOUTDEGREE> void bfs ( const int srcID, const int numVertex, ap_uint <512>* offsetCSR, ap_uint <512>* indexCSR, ap_uint <512>* queue512, ap_uint <32>* queue32, ap_uint <512>* color512, ap_uint <32>* dtime, ap_uint <32>* ftime, ap_uint <32>* pred, ap_uint <32>* distance )
bfs Implement the directed graph traversal by breath-first search algorithm
Parameters:
MAXOUTDEGREE | the maximum outdegree of input graphs. Large value will result in more URAM usage. |
srcID | the source vertex ID for this search, starting from 0 |
numVertex | vertex number of the input graph |
indexCSR | column index of CSR format |
offsetCSR | row offset of CSR format |
color512 | intermediate color map which should be shared with dtime, pred or distance in host. |
queue32 | intermediate queue32 used during the BFS |
dtime | the result of discovery time stamp for each vertex |
ftime | the result of finish time stamp for each vertex |
pred | the result of parent index of each vertex |
distance | the distance result from given source vertex for each vertex |
calcuDegree
#include "calc_degree.hpp"
template < int MAXVERTEX, int MAXEDGE, int LOG2CACHEDEPTH, int LOG2DATAPERCACHELINE, int RAMTYPE > void calcuDegree ( int numVertex, int numEdge, ap_uint <512>* index, ap_uint <512>* degree )
calculate degree algorithm is implemented
Parameters:
MAXVERTEX | CSC/CSR data vertex(offset) array maxsize |
MAXEDGE | CSC/CSR data edge(indice) array maxsize |
LOG2CACHEDEPTH | cache depth in Binary, the cache onchip memory is 512 bit x uramRow |
LOG2DATAPERCACHELINE | number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4 |
RAMTYPE | flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM |
numVertex | CSR/CSC data vertex number |
numEdge | CSR/CSC data edge number |
index | input CSR/CSC data index array |
degree | output degree array |
calcuWeightedDegree
calcuWeightedDegree overload (1)
#include "calc_degree.hpp"
template < int MAXVERTEX, int MAXEDGE, int LOG2CACHEDEPTH, int LOG2DATAPERCACHELINE, int RAMTYPE > void calcuWeightedDegree ( int numVertex, int numEdge, ap_uint <512> index [MAXEDGE], ap_uint <512> weight [MAXEDGE], ap_uint <512> degree [MAXVERTEX] )
calculate weighted degree algorithm is implemented
Parameters:
MAXVERTEX | CSC/CSR data vertex(offset) array maxsize |
MAXEDGE | CSC/CSR data edge(indice) array maxsize |
LOG2CACHEDEPTH | cache depth in Binary, the cache onchip memory is 512 bit x uramRow |
LOG2DATAPERCACHELINE | number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4 |
RAMTYPE | flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM |
numVertex | CSR/CSC data vertex number |
numEdge | CSR/CSC data edge number |
index | input CSR/CSC data index array |
weight | input CSR/CSC data weight array, default float type. |
degree | output degree array, default float type. |
calcuWeightedDegree overload (2)
#include "calc_degree.hpp"
template < int MAXVERTEX, int MAXEDGE, int LOG2CACHEDEPTH, int LOG2DATAPERCACHELINE, int RAMTYPE, int unrollbin, int CHANNELNUM > void calcuWeightedDegree ( int numVertex, int numEdge, int numEdgePerChannel [CHANNELNUM], ap_uint <512> index [MAXEDGE], ap_uint <512> weight [MAXEDGE], ap_uint <512> degree [MAXVERTEX] )
calculate weighted degree algorithm is implemented
Parameters:
MAXVERTEX | CSC/CSR data vertex(offset) array maxsize |
MAXEDGE | CSC/CSR data edge(indice) array maxsize |
LOG2CACHEDEPTH | cache depth in Binary, the cache onchip memory is 512 bit x uramRow |
LOG2DATAPERCACHELINE | number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4 |
RAMTYPE | flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM |
unrollbin | log 2 of data numbers per buff, for buffType is 512b, and data type is double(64b), so there is 8 data in one buff, 8=2^3, the unrollbin is 3 |
CHANNELNUM | pingpong channel number for the design, 2 channel will take 14 persudo channels of HBM while 6 channel will take 26 persudo channels of HBM in multi-channel pagerank |
numVertex | CSR/CSC data vertex number |
numEdge | CSR/CSC data edge number |
numEdgePerChannel | the number of edge per channel to deal with |
index | input CSR/CSC data index array |
weight | input CSR/CSC data weight array, default float type. |
degree | output degree array, default float type. |
connectedComponents
#include "connected_components.hpp"
template < int MAXVERTEX, int MAXEDGE, int LOG2CACHEDEPTH, int LOG2DATAPERCACHELINE, int RAMTYPE, int MAXOUTDEGREE > void connectedComponents ( const int numEdge, const int numVertex, ap_uint <512>* offsetCSR, ap_uint <512>* indexCSR, ap_uint <512>* offsetCSC, ap_uint <512>* indexCSC512, ap_uint <32>* indexCSC32, ap_uint <512>* offsetCSCTmp1, ap_uint <512>* offsetCSCTmp2, ap_uint <512>* queue512, ap_uint <32>* queue32, ap_uint <512>* component512, ap_uint <32>* component32 )
connectedComponents Compute the connected component membership of each vertex only for undirected graph
Parameters:
MAXVERTEX | CSC/CSR data vertex(offset) array maxsize |
MAXEDGE | CSC/CSR data edge(indice) array maxsize |
LOG2CACHEDEPTH | cache depth in Binary, the cache onchip memory is 512 bit x uramRow |
LOG2DATAPERCACHELINE | The log2 of number of data in each cache line (512bit), for double, it’s 3, for float, it’s 4 |
RAMTYPE | flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM |
MAXOUTDEGREE | the max sum of indegree and outdegree of the input graph supported. Large value will result in more URAM usage |
numEdge | edge number of the input graph |
numVertex | vertex number of the input graph |
indexCSR | column index of CSR format |
offsetCSR | row offset of CSR format |
indexCSC512 | column index of CSC format in 512bit, which should be shared the same buffer with indexCSC32 in host |
indexCSC32 | column index of CSC format in 32bit, which should be shared the same buffer with indexCSC512 in host |
offsetCSC | row offset of CSC format |
offsetCSCTmp1 | temp row offset for CSR2CSC convert |
offsetCSCTmp2 | temp row offset of CSR2CSC convert |
queue32 | intermediate queue32 used during the internal BFS |
component512 | Same as component32 but in 512bit, which shoude be shared the same buffer with component32 in host |
component32 | return result buffer with the vertex label containing the lowest vertex id in the connnected component containing that vertex |
convertCsrCsc
#include "convert_csr_csc.hpp"
template < typename DT, int MAXVERTEX, int MAXEDGE, int LOG2CACHEDEPTH, int LOG2DATAPERCACHELINE, int RAMTYPE > void convertCsrCsc ( int numEdge, int numVertex, ap_uint <512>* offsetIn, ap_uint <512>* indexIn, ap_uint <512>* offsetOut, DT* indexOut, ap_uint <512>* offsetTmp0, ap_uint <512>* offsetTmp1 )
convert Csr Csc algorithm is implemented
Parameters:
DT | data type of the input and output, [float, double] |
MAXVERTEX | CSC/CSR data vertex(offset) array maxsize |
MAXEDGE | CSC/CSR data edge(indice) array maxsize |
LOG2CACHEDEPTH | cache depth in Binary, the cache onchip memory is 512 bit x uramRow |
LOG2DATAPERCACHELINE | number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4 |
RAMTYPE | flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM |
numEdge | CSR/CSC data indices number |
numVertex | CSR/CSC data offsets number |
indexIn | original CSR/CSC data indice array |
offsetIn | original CSR/CSC data offset array |
indexOut | output transfered CSC/CSR data indice array |
offsetOut | output transfered CSC/CSR data offset array |
offsetTmp0 | internal temporary CSC/CSR data offset array |
offsetTmp1 | internal temporary CSC/CSR data offset array |
denseSimilarityCoeffs
#include "dense_similarity_coeffs.hpp"
template < int CHNM, int WData, int RAM_SZ, int MAXK > void denseSimilarityCoeffs ( ap_int <32>* config, ap_int <32>* sourceWeight, ap_int <32>* sourceCoeffs, ap_int <32*CHNM>* dataIn00, ap_int <32*CHNM>* dataIn01, ap_int <32*CHNM>* dataIn02, ap_int <32*CHNM>* dataIn03, ap_int <32*CHNM>* dataIn10, ap_int <32*CHNM>* dataIn11, ap_int <32*CHNM>* dataIn12, ap_int <32*CHNM>* dataIn13, ap_int <32*CHNM>* dataIn20, ap_int <32*CHNM>* dataIn21, ap_int <32*CHNM>* dataIn22, ap_int <32*CHNM>* dataIn23, ap_int <32>* resultID, float* similarity )
similarity function for dense graph. It support both Jaccard and Cosine Similarity.
Parameters:
CHNM | the channel number of input data |
WData | the width of input data |
RAM_SZ | the log size of internal URAM |
MAXK | the max supporting number of insert sort function |
config | the control parameter of the primitive which contains: sourceNUM, similarityType, dataType, startID, rowNUM and colNUM of each processing unit(PU) |
sourceWeight | input weight as source vertex for computing similarity |
sourceCoeffs | input coefficient as a scale factor for the source vertex |
dataIn00 | input muti-channel data from HBM0 for PU0 |
dataIn01 | input muti-channel data from HBM1 for PU0 |
dataIn02 | input muti-channel data from HBM2 for PU0 |
dataIn03 | input muti-channel data from HBM3 for PU0 |
dataIn10 | input muti-channel data from HBM4 for PU1 |
dataIn11 | input muti-channel data from HBM5 for PU1 |
dataIn12 | input muti-channel data from HBM6 for PU1 |
dataIn13 | input muti-channel data from HBM7 for PU1 |
dataIn20 | input muti-channel data from HBM8 for PU2 |
dataIn21 | input muti-channel data from HBM9 for PU2 |
dataIn22 | input muti-channel data from HBM10 for PU2 |
dataIn23 | input muti-channel data from HBM11 for PU2 |
rowID | output result ID |
similarity | output similarity value corresponding to its ID |
estimated_diameter
#include "diameter.hpp"
void estimated_diameter ( unsigned numVert, unsigned numEdge, unsigned* offset, unsigned* column, float* weight, float* max_dist, unsigned* src, unsigned* des )
diameter estimate based on the sssp algorithm
Parameters:
numVert | vertex number of the input graph |
numEdge | edge number of the input graph |
offset | row offset of CSR format |
column | column index of CSR format |
weight | weight value of CSR format |
max_distance | the result of max distance |
src | the result of source vertex |
des | the result of destination vertex |
labelPropagation
#include "label_propagation.hpp"
void labelPropagation ( int numEdge, int numVertex, int numIter, ap_uint <512>* offsetCSR, ap_uint <512>* indexCSR, ap_uint <512>* offsetCSC, ap_uint <512>* indexCSC, ap_uint <512>* pingHashBuf, ap_uint <512>* pongHashBuf, ap_uint <512>* labelPing, ap_uint <512>* labelPong )
labelPropagation the label propagation algorithm is implemented
Parameters:
numEdge | edge number of the graph |
numVertex | vertex number of the graph |
numIter | iteration number |
indexCSR | column index of CSR format |
offsetCSR | row offset of CSR format |
indexCSC | row index of CSC format |
offsetCSC | column of CSC format |
labelPing | label ping buffer |
labelPong | label pong buffer |
kernelLouvainTop
#include "louvain_coloring.hpp"
void kernelLouvainTop ( int64_t* config0, DWEIGHT* config1, ap_uint <CSRWIDTHS>* offsets, ap_uint <CSRWIDTHS>* indices, ap_uint <CSRWIDTHS>* weights, ap_uint <COLORWIDTHS>* colorAxi, ap_uint <COLORWIDTHS>* colorInx, ap_uint <DWIDTHS>* cidPrev, ap_uint <DWIDTHS>* cidSizePrev, ap_uint <DWIDTHS>* totPrev, ap_uint <DWIDTHS>* cidCurr, ap_uint <DWIDTHS>* cidSizeCurr, ap_uint <DWIDTHS>* totCurr, ap_uint <DWIDTHS>* cidSizeUpdate, ap_uint <DWIDTHS>* totUpdate, ap_uint <DWIDTHS>* cWeight, ap_uint <CSRWIDTHS>* offsetsDup, ap_uint <CSRWIDTHS>* indicesDup, ap_uint <8>* flag, ap_uint <8>* flagUpdate )
Level2: louvain kernel implement.
Parameters:
config0 | table for the int64 parameter of louvain, numVertex, numColor, total iteration times and so on. |
config1 | table for the double parameter of louvain, epsilon, constant_recip, modularity and so on. |
offsets | CSR data offset array. |
indices | CSR data index array. |
weights | CSR data weight array, support type float. |
colorAxi | color result before ordered. |
colorInx | color result of vertexes before louvain phase. |
cidPrev | pingpang buffer of community id (cid) for all vertex |
cidSizePrev | pingpang buffer of size of community for all cid |
totPrev | pingpang buffer of tot of community for all cid |
cidCurr | pingpang buffer of community id (cid) for all vertex |
cidSizeCurr | pingpang buffer of size of community for all cid |
totCurr | pingpang buffer of tot of community for all cid |
cidSizeUpdate | buffer for update |
totUpdate | buffer for update |
cWeight | community weight of all cid |
offsetsDup | duplicate of offset for fast pruning |
indicesDup | duplicate of index for fast pruning |
flag | pingpang buffer of pruning flag |
flagUpdate | pingpang buffer of pruning flag |
mis
#include "maximal_independent_set.hpp"
template <int MAX_ROUND> void mis ( int m, const int* offset, const int* indices, int* C_group_0, int* C_group_1, int* S_group_0, int* S_group_1, int* res_out )
mis Implement the algorithm which for finding a maximal set of independent vertices
Parameters:
MAX_ROUND | the max round number for mis algorithm iterations |
m | the vertex number of current graph |
offset | offset for undirect-graph |
indices | indices for undirect-graph |
C_group_0 | status field, C-0 |
C_group_1 | status field, C-1 |
S_group_0 | status field, S-0 |
S_group_1 | status field, S-1 |
res_out | selected mis result(output) |
merge_kernel_core
#include "merge.hpp"
void merge_kernel_core ( int num_v, int num_e, int num_c_out, int* num_e_out, DF_V_T* offset_in, DF_D_T* edges_in, DF_W_T* weights_in, DF_V_T* c, DF_V_T* offset_out, DF_D_T* edges_out, DF_W_T* weights_out, DF_V_T* count_c_single, DF_V_T* jump, DF_V_T* count_c, DF_V_T* index_c )
In graph algorithm, sometimes we need to merge a large graph to a small graph based on specific sequence.
Parameters:
num_v | input vertex number, read only and the depth is 1 |
num_e | input edge number. read only and the depth is 1 |
num_c_out | output vertex number, read only and the depth is 1 |
num_e_out | output edge number, write only and the depth is 1 |
offset_in | input vertex offset, read only and the depth is NV |
edges_in | edges for each input vertex, read only and the depth is NE |
weights_in | weight for each input edge, read only and the depth is NE |
c | map the original vertex id to merged vertex, read only and the depth is NV |
offset_out | output vertex offset, write only and the depth is NV |
edges_out | edges for each output vertex, write only and the depth is NE |
weights_out | weight for each output edge, write only and the depth is NE |
count_c_single | get the number of each c, read / write and the depth is NC |
jump | random vertex access sequence, read / write and the depth is NV |
count_c | aggregate offset for the c based on count_c_single, read / write and the depth is NC |
index_c | get the id for the c in same cid, read / write and the depth is NV |
mst
#include "mst.hpp"
void mst ( unsigned int allVert, unsigned int allEdge, unsigned int source, unsigned int* offset, unsigned int* column, float* weight, unsigned* mstRes )
minimum spanning tree based on the Prim algorithm
Parameters:
allVert | vertex number of the input graph |
allEdge | edge number of the input graph |
source | starting point of the Prim algorithm |
offset | row offset of CSR format |
column | column index of CSR format |
weight | weight value of CSR format |
mstRes | the result of the MST. To get the parent vertex of a vertex with ID V in the generated tree, parent=mstRes[V]. |
pageRankTop
pageRankTop overload (1)
#include "pagerank.hpp"
template < typename T, int MAXVERTEX, int MAXEDGE, int LOG2UNROLL, int WIDTHOR, int LOG2CACHEDEPTH, int LOG2DATAPERCACHELINECORE, int LOG2DATAPERCACHELINEDEGREE, int RAMTYPE > void pageRankTop ( int numVertex, int numEdge, ap_uint <512>* degreeCSR, ap_uint <512>* offsetCSC, ap_uint <512>* indexCSC, ap_uint <512>* weightCSC, ap_uint <512>* cntValFull, ap_uint <512>* buffPing, ap_uint <512>* buffPong, ap_uint <WIDTHOR>* orderUnroll, int* resultInfo, T randomProbability = 1.0, T alpha = 0.85, T tolerance = 1e-4, int numIter = 200 )
pagerank algorithm is implemented
Parameters:
T | date type of pagerank, double or float |
MAXVERTEX | CSC/CSR data vertex(offset) array maxsize |
MAXEDGE | CSC/CSR data edge(indice) array maxsize |
LOG2UNROLL | log2 of unroll number, due to DDR limit, best LOG2UNROLL is 3 |
WIDTHOR | order array bandwidth, it’s 256 in our case |
LOG2CACHEDEPTH | log2(cache depth), the onchip memory for cache is 512 bit x CACHEDEPTH (512 bit x 2^LOG2CACHEDEPTH) |
LOG2DATAPERCACHELINECORE | param for module pageRankCore, log2 of number of data in one 512bit (64 byte), for double, it’s log2(64/sizeof(double)) = 3, for float, it’s log2(64/sizeof(float)) = 4 |
LOG2DATAPERCACHELINEDEGREE | param for module calduDegree, log2 of number of data in one 512bit (64 byte), for double, it’s log2(64/sizeof(double)) = 3, for float, it’s log2(64/sizeof(float)) = 4 |
RAMTYPE | flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM |
numVertex | CSR/CSC data offsets number |
numEdge | CSR/CSC data indices number |
degreeCSR | temporary internal degree value |
offsetCSC | CSR/CSC data offset array |
indexCSC | CSR/CSC data indice array |
weightCSC | CSR/CSC data weight array, support type float |
cntValFull | temporary internal initialized mulplier values, length equals to numVertex |
buffPing | ping array to keep temporary pagerank value |
buffPong | pong array to keep temporary pagerank value |
orderUnroll | temporary internal order array to keep initialized offset values |
resultInfo | The output information. resultInfo[0] is isResultinPong, resultInfo[1] is iterations. |
randomProbability | initial PR value, normally 1.0 or 1.0/numVertex |
alpha | damping factor, normally 0.85 |
tolerance | converge tolerance |
numIter | max iteration |
pageRankTop overload (2)
#include "pagerank_multi_channels.hpp"
template < typename T, int MAXVERTEX, int MAXEDGE, int LOG2UNROLL, int WIDTHOR, int LOG2CACHEDEPTH, int LOG2DATAPERCACHELINECORE, int LOG2DATAPERCACHELINEDEGREE, int RAMTYPE > void pageRankTop ( int numVertex, int numEdge, int nsource, ap_uint <32>* sourceID, ap_uint <512>* degreeCSR, ap_uint <512>* offsetCSC, ap_uint <512>* indexCSC, ap_uint <512>* weightCSC, ap_uint <512>* cntValFull0, ap_uint <512>* buffPing0, ap_uint <512>* buffPong0, ap_uint <512>* cntValFull1, ap_uint <512>* buffPing1, ap_uint <512>* buffPong1, ap_uint <WIDTHOR>* orderUnroll, int* resultInfo, T randomProbability = 1.0, T alpha = 0.85, T tolerance = 1e-4, int numIter = 200 )
pagerankMultiChannel algorithm is implemented support: 1. HBM based board
- double / float for PR value calculate
- weighted / unweighted graph / personalized graph
- 2 channel / 6 channel, 2 channel will take 14 persudo channels of HBM while 6 channel will take 26 persudo channels of HBM
Parameters:
T | date type of pagerank, double or float |
MAXVERTEX | CSC/CSR data vertex(offset) array maxsize |
MAXEDGE | CSC/CSR data edge(indice) array maxsize |
LOG2UNROLL | log2 of unroll number for float adder |
WIDTHOR | order array bandwidth, it’s 256 in our case |
LOG2CACHEDEPTH | log2(cache depth), the onchip memory for cache is 512bits x CACHEDEPTH (512 bits x 2^LOG2CACHEDEPTH) |
LOG2DATAPERCACHELINECORE | param for module pageRankCore, log2 of number of data in one 512bit (64 byte), for double, it’s log2(64/sizeof(double)) = 3, for float, it’s log2(64/sizeof(float)) = 4 |
LOG2DATAPERCACHELINEDEGREE | param for module calduDegree, log2 of number of data in one 512bit (64 byte), for double, it’s log2(64/sizeof(double)) = 3, for float, it’s log2(64/sizeof(float)) = 4 |
RAMTYPE | flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2: BRAM |
CHANNEL_NUM | pingpong channel number for the design |
numVertex | CSR/CSC data offsets number |
numEdge | CSR/CSC data indices number |
nsource | source vertex ID number, 0 means not apply pagerank_personalized |
sourceID | source vertex ID array for pagerank_personalized |
degreeCSR | temporary internal degree value |
offsetCSC | CSR/CSC data offset array |
indexCSC | CSR/CSC data indice array |
weightCSC | CSR/CSC data weight array, support type float |
cntValFull0 | temporary internal initialized mulplier values, length equals to numVertex |
buffPing0 | ping array to keep temporary pagerank value |
buffPong0 | pong array to keep temporary pagerank value |
cntValFull1 | temporary internal initialized mulplier values, length equals to numVertex |
buffPing1 | ping array to keep temporary pagerank value |
buffPong1 | pong array to keep temporary pagerank value |
orderUnroll | temporary internal order array to keep initialized offset values |
resultInfo | The output information. resultInfo[0] is isResultinPong, resultInfo[1] is iterations. |
randomProbability | initial PR value, normally 1.0 or 1.0/numVertex |
alpha | damping factor, normally 0.85 |
tolerance | converge tolerance |
numIter | max iteration |
renumberCore
#include "renumber.hpp"
void renumberCore ( int32_t NV, int32_t& numClusters, ap_int < (32)>* oldCids, ap_int < (32)>* mapCid0, ap_int < (32)>* mapCid1, ap_int < (32)>* newCids )
Renumbering recode the categorized graph’s table, and it support 64M for input.
Parameters:
NV | the size of vertices. |
numClusters | the size of renumbered. |
oldCids | the input table for vertices. |
mapCid0 | the intermediate memory for new value, and the memory is writen directly after renumbering. |
mapCid1 | the duplicate memory for mapCid0 |
newCids | the output for renumbering, the value is recoded from 0 to numClusters. |
singleSourceShortestPath
singleSourceShortestPath overload (1)
#include "shortest_path.hpp"
template < int WIDTH, int MAXOUTDEGREE > void singleSourceShortestPath ( ap_uint <32>* config, ap_uint <512>* offsetCSR, ap_uint <512>* indexCSR, ap_uint <512>* weightCSR, ap_uint <512>* queue512, ap_uint <32>* queue32, ap_uint <512>* distance512, ap_uint <WIDTH>* distance32, ap_uint <512>* pred512, ap_uint <32>* pred32, ap_uint <8>* info )
singleSourceShortestPath the single source shortest path algorithm is implemented, the input is the matrix in CSR format.
Parameters:
WIDTH | date width of the weight and the result distance |
MAXOUTDEGREE | The max out put degree of the input graph supported. Large max out degree value |
config | The config data. config[0] is the number of vertices in the graph. config[1] is the lower 32bit of the max distance value. config[2] is the higher 32bit of the max distance value. config[3] is the depth of the queue. config[4] is the option for the sssp: config[4].get_bit(0) is the weight/unweight switch, config[4].get_bit(1) is the path switch, config[4].get_bit(2) is the fixed/float switch. config[5] is sourceID. |
offsetCSR | The offsetCSR buffer that stores the offsetCSR data in CSR format |
indexCSR | The indexCSR buffer that stores the indexCSR dada in CSR format |
weightCSR | The weight buffer that stores the weight data in CSR format |
queue512 | The shortest path requires a queue. The queue will be stored here in 512bit. Please allocate the same buffer with queue32 and budle to the same gmem port with queue32. |
queue32 | The shortest path requires a queue. The queue will be stored here in 32bit. Please allocate the same buffer with queue512 and budle to the same gmem port with queue512. |
distance512 | The distance32 data. The width is 512. When allocating buffers, distance512 and distance32 should point to the same buffer. And please bundle distance512 and distance32 to the same gmem port. |
distance32 | The distance32 data is stored here. When allocating buffers, distance512 and distance32 should point to the same buffer. And please bundle distance512 and distance32 to the same gmem port. |
pred512 | The predecessor data. The width is 512. When allocating buffers, pred512 and pred32 should point to the same buffer. And please bundle pred512 and pred32 to the same gmem port. |
pred32 | The predecessor data is stored here. When allocating buffers, pred512 and pred32 should point to the same buffer. And please bundle pred512 and pred32 to the same gmem port. |
info | The debug information. info[0] shows if the queue overflow. info[1] shows if the precessed graph exceeds the supported maxoutdegree. |
singleSourceShortestPath overload (2)
#include "shortest_path.hpp"
template < int WIDTH, int MAXOUTDEGREE > void singleSourceShortestPath ( ap_uint <32>* config, ap_uint <512>* offsetCSR, ap_uint <512>* indexCSR, ap_uint <512>* weightCSR, ap_uint <512>* queue512, ap_uint <32>* queue32, ap_uint <512>* distance512, ap_uint <WIDTH>* distance32, ap_uint <8>* info )
singleSourceShortestPath the single source shortest path algorithm is implemented, the input is the matrix in CSR format.
Parameters:
WIDTH | date width of the weight and the result distance |
MAXOUTDEGREE | The max out put degree of the input graph supported. Large max out degree value |
config | The config data. config[0] is the number of vertices in the graph. config[1] is the lower 32bit of the max distance value. config[2] is the higher 32bit of the max distance value. config[3] is the depth of the queue. config[4] is the option for the sssp: config[4].get_bit(0) is the weight/unweight switch, config[4].get_bit(2) is the fixed/float switch. config[5] is the sourceID. |
offsetCSR | The offsetCSR buffer that stores the offsetCSR data in CSR format |
indexCSR | The indexCSR buffer that stores the indexCSR dada in CSR format |
weightCSR | The weight buffer that stores the weight data in CSR format |
queue512 | The shortest path requires a queue. The queue will be stored here in 512bit. Please allocate the same buffer with queue32 and budle to the same gmem port with queue32. |
queue32 | The shortest path requires a queue. The queue will be stored here in 32bit. Please allocate the same buffer with queue512 and budle to the same gmem port with queue512. |
distance512 | The distance32 data. The width is 512. When allocating buffers, distance512 and distance32 should point to the same buffer. And please bundle distance512 and distance32 to the same gmem port. |
distance32 | The distance32 data is stored here. When allocating buffers, distance512 and distance32 should point to the same buffer. And please bundle distance512 and distance32 to the same gmem port. |
info | The debug information. info[0] shows if the queue overflow. info[1] shows if the precessed graph exceeds the supported maxoutdegree. |
stronglyConnectedComponents
#include "strongly_connected_components.hpp"
template < int MAXVERTEX, int MAXEDGE, int LOG2CACHEDEPTH, int LOG2DATAPERCACHELINE, int RAMTYPE, int MAXOUTDEGREE > void stronglyConnectedComponents ( const int edgeNum, const int vertexNum, ap_uint <512>* offsetCSR0, ap_uint <512>* indexCSR0, ap_uint <512>* offsetCSC, ap_uint <512>* indxeCSC512, ap_uint <32>* indexCSC32, ap_uint <512>* offsetCSR1, ap_uint <512>* indexCSR1, ap_uint <512>* offsetCSCTmp1, ap_uint <512>* offsetCSCTmp2, ap_uint <512>* colorCSR0512, ap_uint <32>* colorCSR032, ap_uint <32>* queueCSR0, ap_uint <512>* colorCSC512, ap_uint <32>* colorCSC32, ap_uint <32>* queueCSC, ap_uint <32>* queueCSR1, ap_uint <32>* component )
stronglyConnectedComponents Compute the strongly connected component membership of each vertex only for directed graph, and label each vertex with one value containing the lowest vertex id in the SCC containing that vertex.
Parameters:
MAXVERTEX | CSC/CSR data vertex(offset) array maxsize |
MAXEDGE | CSC/CSR data edge(indice) array maxsize |
LOG2CACHEDEPTH | cache depth in Binary, the cache onchip memory is 512 bit x uramRow |
LOG2DATAPERCACHELINE | number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4 |
RAMTYPE | flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM |
MAXOUTDEGREE | the max indegree or outdegree of the input graph supported. Large value will result in more URAM usage |
edgeNum | edge number of the input graph |
vertexNum | vertex number of the input graph |
indexCSR0 | column index of CSR format |
offsetCSR0 | row offset of CSR format |
indxeCSC512 | column index of CSC format in 512bit, which should be shared the same buffer with indexCSC32 in host |
indexCSC32 | column index of CSC format in 32bit, which should be shared the same buffer with indxeCSC512 in host |
offsetCSC | row offset of CSC format |
indexCSR1 | column index of CSR format, which should be shared the same buffer with indexCSR0 in host |
offsetCSR1 | row offset of CSR format, which should be shared the same buffer with offsetCSR0 in host |
offsetCSCTmp1 | temp row offset for CSR2CSC convert |
offsetCSCTmp2 | temp row offset of CSR2CSC convert |
colorCSR0512 | intermediate color map in 512bit for forward-BFS |
colorCSR032 | intermediate color map in 32bit for forward-BFS, which should be shared the same buffer with colorCSR0512 in host |
queueCSR0 | intermediate queue used during the internal forward-BFS |
colorCSC512 | intermediate color map in 512bit for backward-BFS, which should be shared the same buffer with colorCSR0512 in host |
colorCSC32 | intermediate color map in 32bit for backward-BFS, which should be shared the same buffer with colorCSR0512 in host |
queueCSC | intermediate queue used during the internal backward-BFS |
queueCSR1 | intermediate queue which should be shared the same buffer with queueCSR0 |
component | return component buffer with the vertex label containing the lowest vertex id in the strongly connnected component containing that vertex |
triangleCount
#include "triangle_count.hpp"
template < int LEN, int ML > void triangleCount ( int numVertex, int numEdge, ap_uint <512>* offset0, ap_uint <512>* index0, ap_uint <512>* offset1, ap_uint <512>* index1, ap_uint <512>* offset2, ap_uint <512>* index2, uint64_t* triangles )
triangleCount the triangle counting algorithm is implemented, the input is the matrix in CSC format.
Parameters:
LEN | the depth of stream |
ML | URAM depth in the design |
numVertex | length of column offsets |
numEdge | length of row indices |
offset0 | column offsets (begin+end) value |
index0 | row indices value |
offset1 | column offsets (begin+end) value |
index1 | row indices value |
offset2 | 8 column offsets (begin+end) values |
index2 | 16 row indices values |
triangles | return triangle number |
twoHop
#include "twoHop.hpp"
void twoHop ( ap_uint <32> numPairs, ap_uint <64>* pair, unsigned* offsetOneHop, unsigned* indexOneHop, unsigned* offsetTwoHop, unsigned* indexTwoHop, unsigned* cnt_res )
twoHop this API can find the how many 2-hop pathes between two vertices. The input graph is the matrix in CSR format. And a list of src and destination pairs whose 2-hop pathes will be counted.
Parameters:
numPairs | How many pairs of source and destination vertices to be counted. |
pair | The source and destination of pairs are stored in this pointer. |
offsetOneHop | The CSR offset is stored in this pointer. |
indexOneHop | The CSR index is stored in this pointer. |
offsetTwoHop | The CSR offset is stored in this pointer. The graph should be replicated and stored here. This pointer is for an independent AXI port to increase performance. |
indexTwoop | The CSR index is stored in this pointer. The graph should be replicated and stored here. This pointer is for an independent AXI port to increase performance. |
cnt_res | The result of the twoHop API. The order of the result matches the order of the input source and destination pairs. |