namespace graph - 2023.1 English

Vitis Libraries

Release Date
2023-12-20
Version
2023.1 English
// namespaces

namespace xf::graph::internal
    namespace xf::graph::internal::bfs
    namespace xf::graph::internal::calc_degree
    namespace xf::graph::internal::connected_components
    namespace xf::graph::internal::convert_csr_csc
    namespace xf::graph::internal::diameter
    namespace xf::graph::internal::hash_group_aggregate
    namespace xf::graph::internal::label_propagation
    namespace xf::graph::internal::mis
    namespace xf::graph::internal::mssp
    namespace xf::graph::internal::mst
    namespace xf::graph::internal::pagerank
    namespace xf::graph::internal::pagerankMultiChannel
    namespace xf::graph::internal::scc
    namespace xf::graph::internal::sssp
        namespace xf::graph::internal::sssp::nopred
        namespace xf::graph::internal::sssp::pred
    namespace xf::graph::internal::triangle_count
namespace xf::graph::merge

bfs

#include "bfs.hpp"
template <int MAXOUTDEGREE>
void bfs (
    const int srcID,
    const int numVertex,
    ap_uint <512>* offsetCSR,
    ap_uint <512>* indexCSR,
    ap_uint <512>* queue512,
    ap_uint <32>* queue32,
    ap_uint <512>* color512,
    ap_uint <32>* dtime,
    ap_uint <32>* ftime,
    ap_uint <32>* pred,
    ap_uint <32>* distance
    )

bfs Implement the directed graph traversal by breath-first search algorithm

Parameters:

MAXOUTDEGREE the maximum outdegree of input graphs. Large value will result in more URAM usage.
srcID the source vertex ID for this search, starting from 0
numVertex vertex number of the input graph
indexCSR column index of CSR format
offsetCSR row offset of CSR format
color512 intermediate color map which should be shared with dtime, pred or distance in host.
queue32 intermediate queue32 used during the BFS
dtime the result of discovery time stamp for each vertex
ftime the result of finish time stamp for each vertex
pred the result of parent index of each vertex
distance the distance result from given source vertex for each vertex

calcuDegree

#include "calc_degree.hpp"
template <
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINE,
    int RAMTYPE
    >
void calcuDegree (
    int numVertex,
    int numEdge,
    ap_uint <512>* index,
    ap_uint <512>* degree
    )

calculate degree algorithm is implemented

Parameters:

MAXVERTEX CSC/CSR data vertex(offset) array maxsize
MAXEDGE CSC/CSR data edge(indice) array maxsize
LOG2CACHEDEPTH cache depth in Binary, the cache onchip memory is 512 bit x uramRow
LOG2DATAPERCACHELINE number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4
RAMTYPE flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
numVertex CSR/CSC data vertex number
numEdge CSR/CSC data edge number
index input CSR/CSC data index array
degree output degree array

calcuWeightedDegree

calcuWeightedDegree overload (1)

#include "calc_degree.hpp"
template <
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINE,
    int RAMTYPE
    >
void calcuWeightedDegree (
    int numVertex,
    int numEdge,
    ap_uint <512> index [MAXEDGE],
    ap_uint <512> weight [MAXEDGE],
    ap_uint <512> degree [MAXVERTEX]
    )

calculate weighted degree algorithm is implemented

Parameters:

MAXVERTEX CSC/CSR data vertex(offset) array maxsize
MAXEDGE CSC/CSR data edge(indice) array maxsize
LOG2CACHEDEPTH cache depth in Binary, the cache onchip memory is 512 bit x uramRow
LOG2DATAPERCACHELINE number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4
RAMTYPE flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
numVertex CSR/CSC data vertex number
numEdge CSR/CSC data edge number
index input CSR/CSC data index array
weight input CSR/CSC data weight array, default float type.
degree output degree array, default float type.

calcuWeightedDegree overload (2)

#include "calc_degree.hpp"
template <
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINE,
    int RAMTYPE,
    int unrollbin,
    int CHANNELNUM
    >
void calcuWeightedDegree (
    int numVertex,
    int numEdge,
    int numEdgePerChannel [CHANNELNUM],
    ap_uint <512> index [MAXEDGE],
    ap_uint <512> weight [MAXEDGE],
    ap_uint <512> degree [MAXVERTEX]
    )

calculate weighted degree algorithm is implemented

Parameters:

MAXVERTEX CSC/CSR data vertex(offset) array maxsize
MAXEDGE CSC/CSR data edge(indice) array maxsize
LOG2CACHEDEPTH cache depth in Binary, the cache onchip memory is 512 bit x uramRow
LOG2DATAPERCACHELINE number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4
RAMTYPE flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
unrollbin log 2 of data numbers per buff, for buffType is 512b, and data type is double(64b), so there is 8 data in one buff, 8=2^3, the unrollbin is 3
CHANNELNUM pingpong channel number for the design, 2 channel will take 14 persudo channels of HBM while 6 channel will take 26 persudo channels of HBM in multi-channel pagerank
numVertex CSR/CSC data vertex number
numEdge CSR/CSC data edge number
numEdgePerChannel the number of edge per channel to deal with
index input CSR/CSC data index array
weight input CSR/CSC data weight array, default float type.
degree output degree array, default float type.

connectedComponents

#include "connected_components.hpp"
template <
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINE,
    int RAMTYPE,
    int MAXOUTDEGREE
    >
void connectedComponents (
    const int numEdge,
    const int numVertex,
    ap_uint <512>* offsetCSR,
    ap_uint <512>* indexCSR,
    ap_uint <512>* offsetCSC,
    ap_uint <512>* indexCSC512,
    ap_uint <32>* indexCSC32,
    ap_uint <512>* offsetCSCTmp1,
    ap_uint <512>* offsetCSCTmp2,
    ap_uint <512>* queue512,
    ap_uint <32>* queue32,
    ap_uint <512>* component512,
    ap_uint <32>* component32
    )

connectedComponents Compute the connected component membership of each vertex only for undirected graph

Parameters:

MAXVERTEX CSC/CSR data vertex(offset) array maxsize
MAXEDGE CSC/CSR data edge(indice) array maxsize
LOG2CACHEDEPTH cache depth in Binary, the cache onchip memory is 512 bit x uramRow
LOG2DATAPERCACHELINE The log2 of number of data in each cache line (512bit), for double, it’s 3, for float, it’s 4
RAMTYPE flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
MAXOUTDEGREE the max sum of indegree and outdegree of the input graph supported. Large value will result in more URAM usage
numEdge edge number of the input graph
numVertex vertex number of the input graph
indexCSR column index of CSR format
offsetCSR row offset of CSR format
indexCSC512 column index of CSC format in 512bit, which should be shared the same buffer with indexCSC32 in host
indexCSC32 column index of CSC format in 32bit, which should be shared the same buffer with indexCSC512 in host
offsetCSC row offset of CSC format
offsetCSCTmp1 temp row offset for CSR2CSC convert
offsetCSCTmp2 temp row offset of CSR2CSC convert
queue32 intermediate queue32 used during the internal BFS
component512 Same as component32 but in 512bit, which shoude be shared the same buffer with component32 in host
component32 return result buffer with the vertex label containing the lowest vertex id in the connnected component containing that vertex

convertCsrCsc

#include "convert_csr_csc.hpp"
template <
    typename DT,
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINE,
    int RAMTYPE
    >
void convertCsrCsc (
    int numEdge,
    int numVertex,
    ap_uint <512>* offsetIn,
    ap_uint <512>* indexIn,
    ap_uint <512>* offsetOut,
    DT* indexOut,
    ap_uint <512>* offsetTmp0,
    ap_uint <512>* offsetTmp1
    )

convert Csr Csc algorithm is implemented

Parameters:

DT data type of the input and output, [float, double]
MAXVERTEX CSC/CSR data vertex(offset) array maxsize
MAXEDGE CSC/CSR data edge(indice) array maxsize
LOG2CACHEDEPTH cache depth in Binary, the cache onchip memory is 512 bit x uramRow
LOG2DATAPERCACHELINE number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4
RAMTYPE flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
numEdge CSR/CSC data indices number
numVertex CSR/CSC data offsets number
indexIn original CSR/CSC data indice array
offsetIn original CSR/CSC data offset array
indexOut output transfered CSC/CSR data indice array
offsetOut output transfered CSC/CSR data offset array
offsetTmp0 internal temporary CSC/CSR data offset array
offsetTmp1 internal temporary CSC/CSR data offset array

denseSimilarityCoeffs

#include "dense_similarity_coeffs.hpp"
template <
    int CHNM,
    int WData,
    int RAM_SZ,
    int MAXK
    >
void denseSimilarityCoeffs (
    ap_int <32>* config,
    ap_int <32>* sourceWeight,
    ap_int <32>* sourceCoeffs,
    ap_int <32*CHNM>* dataIn00,
    ap_int <32*CHNM>* dataIn01,
    ap_int <32*CHNM>* dataIn02,
    ap_int <32*CHNM>* dataIn03,
    ap_int <32*CHNM>* dataIn10,
    ap_int <32*CHNM>* dataIn11,
    ap_int <32*CHNM>* dataIn12,
    ap_int <32*CHNM>* dataIn13,
    ap_int <32*CHNM>* dataIn20,
    ap_int <32*CHNM>* dataIn21,
    ap_int <32*CHNM>* dataIn22,
    ap_int <32*CHNM>* dataIn23,
    ap_int <32>* resultID,
    float* similarity
    )

similarity function for dense graph. It support both Jaccard and Cosine Similarity.

Parameters:

CHNM the channel number of input data
WData the width of input data
RAM_SZ the log size of internal URAM
MAXK the max supporting number of insert sort function
config the control parameter of the primitive which contains: sourceNUM, similarityType, dataType, startID, rowNUM and colNUM of each processing unit(PU)
sourceWeight input weight as source vertex for computing similarity
sourceCoeffs input coefficient as a scale factor for the source vertex
dataIn00 input muti-channel data from HBM0 for PU0
dataIn01 input muti-channel data from HBM1 for PU0
dataIn02 input muti-channel data from HBM2 for PU0
dataIn03 input muti-channel data from HBM3 for PU0
dataIn10 input muti-channel data from HBM4 for PU1
dataIn11 input muti-channel data from HBM5 for PU1
dataIn12 input muti-channel data from HBM6 for PU1
dataIn13 input muti-channel data from HBM7 for PU1
dataIn20 input muti-channel data from HBM8 for PU2
dataIn21 input muti-channel data from HBM9 for PU2
dataIn22 input muti-channel data from HBM10 for PU2
dataIn23 input muti-channel data from HBM11 for PU2
rowID output result ID
similarity output similarity value corresponding to its ID

estimated_diameter

#include "diameter.hpp"
void estimated_diameter (
    unsigned numVert,
    unsigned numEdge,
    unsigned* offset,
    unsigned* column,
    float* weight,
    float* max_dist,
    unsigned* src,
    unsigned* des
    )

diameter estimate based on the sssp algorithm

Parameters:

numVert vertex number of the input graph
numEdge edge number of the input graph
offset row offset of CSR format
column column index of CSR format
weight weight value of CSR format
max_distance the result of max distance
src the result of source vertex
des the result of destination vertex

labelPropagation

#include "label_propagation.hpp"
void labelPropagation (
    int numEdge,
    int numVertex,
    int numIter,
    ap_uint <512>* offsetCSR,
    ap_uint <512>* indexCSR,
    ap_uint <512>* offsetCSC,
    ap_uint <512>* indexCSC,
    ap_uint <512>* pingHashBuf,
    ap_uint <512>* pongHashBuf,
    ap_uint <512>* labelPing,
    ap_uint <512>* labelPong
    )

labelPropagation the label propagation algorithm is implemented

Parameters:

numEdge edge number of the graph
numVertex vertex number of the graph
numIter iteration number
indexCSR column index of CSR format
offsetCSR row offset of CSR format
indexCSC row index of CSC format
offsetCSC column of CSC format
labelPing label ping buffer
labelPong label pong buffer

kernelLouvainTop

#include "louvain_coloring.hpp"
void kernelLouvainTop (
    int64_t* config0,
    DWEIGHT* config1,
    ap_uint <CSRWIDTHS>* offsets,
    ap_uint <CSRWIDTHS>* indices,
    ap_uint <CSRWIDTHS>* weights,
    ap_uint <COLORWIDTHS>* colorAxi,
    ap_uint <COLORWIDTHS>* colorInx,
    ap_uint <DWIDTHS>* cidPrev,
    ap_uint <DWIDTHS>* cidSizePrev,
    ap_uint <DWIDTHS>* totPrev,
    ap_uint <DWIDTHS>* cidCurr,
    ap_uint <DWIDTHS>* cidSizeCurr,
    ap_uint <DWIDTHS>* totCurr,
    ap_uint <DWIDTHS>* cidSizeUpdate,
    ap_uint <DWIDTHS>* totUpdate,
    ap_uint <DWIDTHS>* cWeight,
    ap_uint <CSRWIDTHS>* offsetsDup,
    ap_uint <CSRWIDTHS>* indicesDup,
    ap_uint <8>* flag,
    ap_uint <8>* flagUpdate
    )

Level2: louvain kernel implement.

Parameters:

config0 table for the int64 parameter of louvain, numVertex, numColor, total iteration times and so on.
config1 table for the double parameter of louvain, epsilon, constant_recip, modularity and so on.
offsets CSR data offset array.
indices CSR data index array.
weights CSR data weight array, support type float.
colorAxi color result before ordered.
colorInx color result of vertexes before louvain phase.
cidPrev pingpang buffer of community id (cid) for all vertex
cidSizePrev pingpang buffer of size of community for all cid
totPrev pingpang buffer of tot of community for all cid
cidCurr pingpang buffer of community id (cid) for all vertex
cidSizeCurr pingpang buffer of size of community for all cid
totCurr pingpang buffer of tot of community for all cid
cidSizeUpdate buffer for update
totUpdate buffer for update
cWeight community weight of all cid
offsetsDup duplicate of offset for fast pruning
indicesDup duplicate of index for fast pruning
flag pingpang buffer of pruning flag
flagUpdate pingpang buffer of pruning flag

mis

#include "maximal_independent_set.hpp"
template <int MAX_ROUND>
void mis (
    int m,
    const int* offset,
    const int* indices,
    int* C_group_0,
    int* C_group_1,
    int* S_group_0,
    int* S_group_1,
    int* res_out
    )

mis Implement the algorithm which for finding a maximal set of independent vertices

Parameters:

MAX_ROUND the max round number for mis algorithm iterations
m the vertex number of current graph
offset offset for undirect-graph
indices indices for undirect-graph
C_group_0 status field, C-0
C_group_1 status field, C-1
S_group_0 status field, S-0
S_group_1 status field, S-1
res_out selected mis result(output)

merge_kernel_core

#include "merge.hpp"
void merge_kernel_core (
    int num_v,
    int num_e,
    int num_c_out,
    int* num_e_out,
    DF_V_T* offset_in,
    DF_D_T* edges_in,
    DF_W_T* weights_in,
    DF_V_T* c,
    DF_V_T* offset_out,
    DF_D_T* edges_out,
    DF_W_T* weights_out,
    DF_V_T* count_c_single,
    DF_V_T* jump,
    DF_V_T* count_c,
    DF_V_T* index_c
    )

In graph algorithm, sometimes we need to merge a large graph to a small graph based on specific sequence.

Parameters:

num_v input vertex number, read only and the depth is 1
num_e input edge number. read only and the depth is 1
num_c_out output vertex number, read only and the depth is 1
num_e_out output edge number, write only and the depth is 1
offset_in input vertex offset, read only and the depth is NV
edges_in edges for each input vertex, read only and the depth is NE
weights_in weight for each input edge, read only and the depth is NE
c map the original vertex id to merged vertex, read only and the depth is NV
offset_out output vertex offset, write only and the depth is NV
edges_out edges for each output vertex, write only and the depth is NE
weights_out weight for each output edge, write only and the depth is NE
count_c_single get the number of each c, read / write and the depth is NC
jump random vertex access sequence, read / write and the depth is NV
count_c aggregate offset for the c based on count_c_single, read / write and the depth is NC
index_c get the id for the c in same cid, read / write and the depth is NV

mst

#include "mst.hpp"
void mst (
    unsigned int allVert,
    unsigned int allEdge,
    unsigned int source,
    unsigned int* offset,
    unsigned int* column,
    float* weight,
    unsigned* mstRes
    )

minimum spanning tree based on the Prim algorithm

Parameters:

allVert vertex number of the input graph
allEdge edge number of the input graph
source starting point of the Prim algorithm
offset row offset of CSR format
column column index of CSR format
weight weight value of CSR format
mstRes the result of the MST. To get the parent vertex of a vertex with ID V in the generated tree, parent=mstRes[V].

pageRankTop

pageRankTop overload (1)

#include "pagerank.hpp"
template <
    typename T,
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2UNROLL,
    int WIDTHOR,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINECORE,
    int LOG2DATAPERCACHELINEDEGREE,
    int RAMTYPE
    >
void pageRankTop (
    int numVertex,
    int numEdge,
    ap_uint <512>* degreeCSR,
    ap_uint <512>* offsetCSC,
    ap_uint <512>* indexCSC,
    ap_uint <512>* weightCSC,
    ap_uint <512>* cntValFull,
    ap_uint <512>* buffPing,
    ap_uint <512>* buffPong,
    ap_uint <WIDTHOR>* orderUnroll,
    int* resultInfo,
    T randomProbability = 1.0,
    T alpha = 0.85,
    T tolerance = 1e-4,
    int numIter = 200
    )

pagerank algorithm is implemented

Parameters:

T date type of pagerank, double or float
MAXVERTEX CSC/CSR data vertex(offset) array maxsize
MAXEDGE CSC/CSR data edge(indice) array maxsize
LOG2UNROLL log2 of unroll number, due to DDR limit, best LOG2UNROLL is 3
WIDTHOR order array bandwidth, it’s 256 in our case
LOG2CACHEDEPTH log2(cache depth), the onchip memory for cache is 512 bit x CACHEDEPTH (512 bit x 2^LOG2CACHEDEPTH)
LOG2DATAPERCACHELINECORE param for module pageRankCore, log2 of number of data in one 512bit (64 byte), for double, it’s log2(64/sizeof(double)) = 3, for float, it’s log2(64/sizeof(float)) = 4
LOG2DATAPERCACHELINEDEGREE param for module calduDegree, log2 of number of data in one 512bit (64 byte), for double, it’s log2(64/sizeof(double)) = 3, for float, it’s log2(64/sizeof(float)) = 4
RAMTYPE flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
numVertex CSR/CSC data offsets number
numEdge CSR/CSC data indices number
degreeCSR temporary internal degree value
offsetCSC CSR/CSC data offset array
indexCSC CSR/CSC data indice array
weightCSC CSR/CSC data weight array, support type float
cntValFull temporary internal initialized mulplier values, length equals to numVertex
buffPing ping array to keep temporary pagerank value
buffPong pong array to keep temporary pagerank value
orderUnroll temporary internal order array to keep initialized offset values
resultInfo The output information. resultInfo[0] is isResultinPong, resultInfo[1] is iterations.
randomProbability initial PR value, normally 1.0 or 1.0/numVertex
alpha damping factor, normally 0.85
tolerance converge tolerance
numIter max iteration

pageRankTop overload (2)

#include "pagerank_multi_channels.hpp"
template <
    typename T,
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2UNROLL,
    int WIDTHOR,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINECORE,
    int LOG2DATAPERCACHELINEDEGREE,
    int RAMTYPE
    >
void pageRankTop (
    int numVertex,
    int numEdge,
    int nsource,
    ap_uint <32>* sourceID,
    ap_uint <512>* degreeCSR,
    ap_uint <512>* offsetCSC,
    ap_uint <512>* indexCSC,
    ap_uint <512>* weightCSC,
    ap_uint <512>* cntValFull0,
    ap_uint <512>* buffPing0,
    ap_uint <512>* buffPong0,
    ap_uint <512>* cntValFull1,
    ap_uint <512>* buffPing1,
    ap_uint <512>* buffPong1,
    ap_uint <WIDTHOR>* orderUnroll,
    int* resultInfo,
    T randomProbability = 1.0,
    T alpha = 0.85,
    T tolerance = 1e-4,
    int numIter = 200
    )

pagerankMultiChannel algorithm is implemented support: 1. HBM based board

  1. double / float for PR value calculate
  2. weighted / unweighted graph / personalized graph
  3. 2 channel / 6 channel, 2 channel will take 14 persudo channels of HBM while 6 channel will take 26 persudo channels of HBM

Parameters:

T date type of pagerank, double or float
MAXVERTEX CSC/CSR data vertex(offset) array maxsize
MAXEDGE CSC/CSR data edge(indice) array maxsize
LOG2UNROLL log2 of unroll number for float adder
WIDTHOR order array bandwidth, it’s 256 in our case
LOG2CACHEDEPTH log2(cache depth), the onchip memory for cache is 512bits x CACHEDEPTH (512 bits x 2^LOG2CACHEDEPTH)
LOG2DATAPERCACHELINECORE param for module pageRankCore, log2 of number of data in one 512bit (64 byte), for double, it’s log2(64/sizeof(double)) = 3, for float, it’s log2(64/sizeof(float)) = 4
LOG2DATAPERCACHELINEDEGREE param for module calduDegree, log2 of number of data in one 512bit (64 byte), for double, it’s log2(64/sizeof(double)) = 3, for float, it’s log2(64/sizeof(float)) = 4
RAMTYPE flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2: BRAM
CHANNEL_NUM pingpong channel number for the design
numVertex CSR/CSC data offsets number
numEdge CSR/CSC data indices number
nsource source vertex ID number, 0 means not apply pagerank_personalized
sourceID source vertex ID array for pagerank_personalized
degreeCSR temporary internal degree value
offsetCSC CSR/CSC data offset array
indexCSC CSR/CSC data indice array
weightCSC CSR/CSC data weight array, support type float
cntValFull0 temporary internal initialized mulplier values, length equals to numVertex
buffPing0 ping array to keep temporary pagerank value
buffPong0 pong array to keep temporary pagerank value
cntValFull1 temporary internal initialized mulplier values, length equals to numVertex
buffPing1 ping array to keep temporary pagerank value
buffPong1 pong array to keep temporary pagerank value
orderUnroll temporary internal order array to keep initialized offset values
resultInfo The output information. resultInfo[0] is isResultinPong, resultInfo[1] is iterations.
randomProbability initial PR value, normally 1.0 or 1.0/numVertex
alpha damping factor, normally 0.85
tolerance converge tolerance
numIter max iteration

renumberCore

#include "renumber.hpp"
void renumberCore (
    int32_t NV,
    int32_t& numClusters,
    ap_int < (32)>* oldCids,
    ap_int < (32)>* mapCid0,
    ap_int < (32)>* mapCid1,
    ap_int < (32)>* newCids
    )

Renumbering recode the categorized graph’s table, and it support 64M for input.

Parameters:

NV the size of vertices.
numClusters the size of renumbered.
oldCids the input table for vertices.
mapCid0 the intermediate memory for new value, and the memory is writen directly after renumbering.
mapCid1 the duplicate memory for mapCid0
newCids the output for renumbering, the value is recoded from 0 to numClusters.

singleSourceShortestPath

singleSourceShortestPath overload (1)

#include "shortest_path.hpp"
template <
    int WIDTH,
    int MAXOUTDEGREE
    >
void singleSourceShortestPath (
    ap_uint <32>* config,
    ap_uint <512>* offsetCSR,
    ap_uint <512>* indexCSR,
    ap_uint <512>* weightCSR,
    ap_uint <512>* queue512,
    ap_uint <32>* queue32,
    ap_uint <512>* distance512,
    ap_uint <WIDTH>* distance32,
    ap_uint <512>* pred512,
    ap_uint <32>* pred32,
    ap_uint <8>* info
    )

singleSourceShortestPath the single source shortest path algorithm is implemented, the input is the matrix in CSR format.

Parameters:

WIDTH date width of the weight and the result distance
MAXOUTDEGREE The max out put degree of the input graph supported. Large max out degree value
config The config data. config[0] is the number of vertices in the graph. config[1] is the lower 32bit of the max distance value. config[2] is the higher 32bit of the max distance value. config[3] is the depth of the queue. config[4] is the option for the sssp: config[4].get_bit(0) is the weight/unweight switch, config[4].get_bit(1) is the path switch, config[4].get_bit(2) is the fixed/float switch. config[5] is sourceID.
offsetCSR The offsetCSR buffer that stores the offsetCSR data in CSR format
indexCSR The indexCSR buffer that stores the indexCSR dada in CSR format
weightCSR The weight buffer that stores the weight data in CSR format
queue512 The shortest path requires a queue. The queue will be stored here in 512bit. Please allocate the same buffer with queue32 and budle to the same gmem port with queue32.
queue32 The shortest path requires a queue. The queue will be stored here in 32bit. Please allocate the same buffer with queue512 and budle to the same gmem port with queue512.
distance512 The distance32 data. The width is 512. When allocating buffers, distance512 and distance32 should point to the same buffer. And please bundle distance512 and distance32 to the same gmem port.
distance32 The distance32 data is stored here. When allocating buffers, distance512 and distance32 should point to the same buffer. And please bundle distance512 and distance32 to the same gmem port.
pred512 The predecessor data. The width is 512. When allocating buffers, pred512 and pred32 should point to the same buffer. And please bundle pred512 and pred32 to the same gmem port.
pred32 The predecessor data is stored here. When allocating buffers, pred512 and pred32 should point to the same buffer. And please bundle pred512 and pred32 to the same gmem port.
info The debug information. info[0] shows if the queue overflow. info[1] shows if the precessed graph exceeds the supported maxoutdegree.

singleSourceShortestPath overload (2)

#include "shortest_path.hpp"
template <
    int WIDTH,
    int MAXOUTDEGREE
    >
void singleSourceShortestPath (
    ap_uint <32>* config,
    ap_uint <512>* offsetCSR,
    ap_uint <512>* indexCSR,
    ap_uint <512>* weightCSR,
    ap_uint <512>* queue512,
    ap_uint <32>* queue32,
    ap_uint <512>* distance512,
    ap_uint <WIDTH>* distance32,
    ap_uint <8>* info
    )

singleSourceShortestPath the single source shortest path algorithm is implemented, the input is the matrix in CSR format.

Parameters:

WIDTH date width of the weight and the result distance
MAXOUTDEGREE The max out put degree of the input graph supported. Large max out degree value
config The config data. config[0] is the number of vertices in the graph. config[1] is the lower 32bit of the max distance value. config[2] is the higher 32bit of the max distance value. config[3] is the depth of the queue. config[4] is the option for the sssp: config[4].get_bit(0) is the weight/unweight switch, config[4].get_bit(2) is the fixed/float switch. config[5] is the sourceID.
offsetCSR The offsetCSR buffer that stores the offsetCSR data in CSR format
indexCSR The indexCSR buffer that stores the indexCSR dada in CSR format
weightCSR The weight buffer that stores the weight data in CSR format
queue512 The shortest path requires a queue. The queue will be stored here in 512bit. Please allocate the same buffer with queue32 and budle to the same gmem port with queue32.
queue32 The shortest path requires a queue. The queue will be stored here in 32bit. Please allocate the same buffer with queue512 and budle to the same gmem port with queue512.
distance512 The distance32 data. The width is 512. When allocating buffers, distance512 and distance32 should point to the same buffer. And please bundle distance512 and distance32 to the same gmem port.
distance32 The distance32 data is stored here. When allocating buffers, distance512 and distance32 should point to the same buffer. And please bundle distance512 and distance32 to the same gmem port.
info The debug information. info[0] shows if the queue overflow. info[1] shows if the precessed graph exceeds the supported maxoutdegree.

stronglyConnectedComponents

#include "strongly_connected_components.hpp"
template <
    int MAXVERTEX,
    int MAXEDGE,
    int LOG2CACHEDEPTH,
    int LOG2DATAPERCACHELINE,
    int RAMTYPE,
    int MAXOUTDEGREE
    >
void stronglyConnectedComponents (
    const int edgeNum,
    const int vertexNum,
    ap_uint <512>* offsetCSR0,
    ap_uint <512>* indexCSR0,
    ap_uint <512>* offsetCSC,
    ap_uint <512>* indxeCSC512,
    ap_uint <32>* indexCSC32,
    ap_uint <512>* offsetCSR1,
    ap_uint <512>* indexCSR1,
    ap_uint <512>* offsetCSCTmp1,
    ap_uint <512>* offsetCSCTmp2,
    ap_uint <512>* colorCSR0512,
    ap_uint <32>* colorCSR032,
    ap_uint <32>* queueCSR0,
    ap_uint <512>* colorCSC512,
    ap_uint <32>* colorCSC32,
    ap_uint <32>* queueCSC,
    ap_uint <32>* queueCSR1,
    ap_uint <32>* component
    )

stronglyConnectedComponents Compute the strongly connected component membership of each vertex only for directed graph, and label each vertex with one value containing the lowest vertex id in the SCC containing that vertex.

Parameters:

MAXVERTEX CSC/CSR data vertex(offset) array maxsize
MAXEDGE CSC/CSR data edge(indice) array maxsize
LOG2CACHEDEPTH cache depth in Binary, the cache onchip memory is 512 bit x uramRow
LOG2DATAPERCACHELINE number of data in one 512bit in Binary, for double, it’s 3, for float, it’s 4
RAMTYPE flag to tell use URAM LUTRAM or BRAM, 0 : LUTRAM, 1 : URAM, 2 : BRAM
MAXOUTDEGREE the max indegree or outdegree of the input graph supported. Large value will result in more URAM usage
edgeNum edge number of the input graph
vertexNum vertex number of the input graph
indexCSR0 column index of CSR format
offsetCSR0 row offset of CSR format
indxeCSC512 column index of CSC format in 512bit, which should be shared the same buffer with indexCSC32 in host
indexCSC32 column index of CSC format in 32bit, which should be shared the same buffer with indxeCSC512 in host
offsetCSC row offset of CSC format
indexCSR1 column index of CSR format, which should be shared the same buffer with indexCSR0 in host
offsetCSR1 row offset of CSR format, which should be shared the same buffer with offsetCSR0 in host
offsetCSCTmp1 temp row offset for CSR2CSC convert
offsetCSCTmp2 temp row offset of CSR2CSC convert
colorCSR0512 intermediate color map in 512bit for forward-BFS
colorCSR032 intermediate color map in 32bit for forward-BFS, which should be shared the same buffer with colorCSR0512 in host
queueCSR0 intermediate queue used during the internal forward-BFS
colorCSC512 intermediate color map in 512bit for backward-BFS, which should be shared the same buffer with colorCSR0512 in host
colorCSC32 intermediate color map in 32bit for backward-BFS, which should be shared the same buffer with colorCSR0512 in host
queueCSC intermediate queue used during the internal backward-BFS
queueCSR1 intermediate queue which should be shared the same buffer with queueCSR0
component return component buffer with the vertex label containing the lowest vertex id in the strongly connnected component containing that vertex

triangleCount

#include "triangle_count.hpp"
template <
    int LEN,
    int ML
    >
void triangleCount (
    int numVertex,
    int numEdge,
    ap_uint <512>* offset0,
    ap_uint <512>* index0,
    ap_uint <512>* offset1,
    ap_uint <512>* index1,
    ap_uint <512>* offset2,
    ap_uint <512>* index2,
    uint64_t* triangles
    )

triangleCount the triangle counting algorithm is implemented, the input is the matrix in CSC format.

Parameters:

LEN the depth of stream
ML URAM depth in the design
numVertex length of column offsets
numEdge length of row indices
offset0 column offsets (begin+end) value
index0 row indices value
offset1 column offsets (begin+end) value
index1 row indices value
offset2 8 column offsets (begin+end) values
index2 16 row indices values
triangles return triangle number

twoHop

#include "twoHop.hpp"
void twoHop (
    ap_uint <32> numPairs,
    ap_uint <64>* pair,
    unsigned* offsetOneHop,
    unsigned* indexOneHop,
    unsigned* offsetTwoHop,
    unsigned* indexTwoHop,
    unsigned* cnt_res
    )

twoHop this API can find the how many 2-hop pathes between two vertices. The input graph is the matrix in CSR format. And a list of src and destination pairs whose 2-hop pathes will be counted.

Parameters:

numPairs How many pairs of source and destination vertices to be counted.
pair The source and destination of pairs are stored in this pointer.
offsetOneHop The CSR offset is stored in this pointer.
indexOneHop The CSR index is stored in this pointer.
offsetTwoHop The CSR offset is stored in this pointer. The graph should be replicated and stored here. This pointer is for an independent AXI port to increase performance.
indexTwoop The CSR index is stored in this pointer. The graph should be replicated and stored here. This pointer is for an independent AXI port to increase performance.
cnt_res The result of the twoHop API. The order of the result matches the order of the input source and destination pairs.