QR_MUMPS
 All Classes Files Functions Variables Enumerations Enumerator Pages
dqrm_symbolic.F90
Go to the documentation of this file.
1 !! ##############################################################################################
2 !!
3 !! Copyright 2012 CNRS, INPT
4 !!
5 !! This file is part of qr_mumps.
6 !!
7 !! qr_mumps is free software: you can redistribute it and/or modify
8 !! it under the terms of the GNU Lesser General Public License as
9 !! published by the Free Software Foundation, either version 3 of
10 !! the License, or (at your option) any later version.
11 !!
12 !! qr_mumps is distributed in the hope that it will be useful,
13 !! but WITHOUT ANY WARRANTY; without even the implied warranty of
14 !! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 !! GNU Lesser General Public License for more details.
16 !!
17 !! You can find a copy of the GNU Lesser General Public License
18 !! in the qr_mumps/doc directory.
19 !!
20 !! ##############################################################################################
21 
22 
23 !! ##############################################################################################
33 
34 
35 #include "qrm_common.h"
36 
38 
39 
71 subroutine dqrm_symbolic(graph)
72 
73  use dqrm_spmat_mod
74  use qrm_adata_mod
75  use qrm_error_mod
76  use qrm_sort_mod
77  use qrm_common_mod
78  implicit none
79 
80  type(dqrm_spmat_type), target :: graph
81 
82  integer :: i, j, f, p, pp, ppp, root, node, roff, ne, np
83  integer :: first, c, ib, nlz, nth, leaves, totleaves
84  integer :: m, n, k, cyc, nb, fm, fn, fk
85  real(kind(1.d0)), allocatable :: n_weight(:), t_weight(:), lzero_w(:), proc_w(:)
86  real(kind(1.d0)) :: rm, rk, rn, totflops, smallth
87  integer, allocatable :: col_map(:), mark(:), stair(:), lzero(:), aux(:)
88  integer, pointer :: porder(:), rc(:), parent(:), fcol(:), fcol_ptr(:)
89  type(dqrm_spmat_type) :: g_csr
90  logical :: found
91  type(qrm_adata_type), pointer :: adata
92  integer(kind=8) :: hsize, rsize
93  ! error management
94  integer :: err_act
95  character(len=*), parameter :: name='qrm_symbolic'
96 
97  call qrm_err_act_save(err_act)
98 
99  ! just to simplify
100  adata => graph%adata
101  porder => adata%cperm
102  rc => adata%rc
103  parent => adata%parent
104 
105 
106  call qrm_aalloc(adata%fcol_ptr, adata%nnodes+1)
107  call qrm_aalloc(adata%fcol, sum(rc))
108  __qrm_check_ret(name,'qrm_aalloc',9999)
109 
110  ! just to simplify
111  fcol => adata%fcol
112  fcol_ptr => adata%fcol_ptr
113 
114 
115  ! first, determine the columns in each front.
116 
117  ! col_map(j)=f means that global column j is a principal variable in
118  ! front f
119  call qrm_aalloc(col_map, graph%n)
120  call qrm_aalloc(mark, adata%nnodes)
121  __qrm_check_ret(name,'qrm_aalloc2',9999)
122  mark = 0
123 
124  fcol_ptr(1:2)=1
125 
126  ! extract the first iteration in order to initialize fcol_ptr as
127  ! well
128  f = 1
129  do p=adata%cp_ptr(f), adata%cp_ptr(f+1)-1
130  j = porder(p)
131  col_map(j) = f
132  end do
133 
134  do f=2, adata%nnodes
135  fcol_ptr(f+1) = fcol_ptr(f)+max(rc(f-1),0)
136  do p=adata%cp_ptr(f), adata%cp_ptr(f+1)-1
137  j = porder(p)
138  col_map(j) = f
139  end do
140  end do
141 
142 #if defined(debug)
143  if(p .ne. graph%n+1) then
144  __qrm_prnt_dbg(.ne.'("Error in symbolic. i n ",i5,2x,i5)')p, graph%n
145  end if
146 #endif
147 
148  ! on input the graph is in csc format. we also need a csr
149  ! representation in order to determine the row-subtrees of A'*A
150  call dqrm_spmat_convert(graph, g_csr, 'csr', .false.)
151  __qrm_check_ret(name,'qrm_spmat_convert',9999)
152 
153  do i=1,g_csr%nz
154  g_csr%jcn(i) = adata%icperm(g_csr%jcn(i))
155  end do
156 
157  call sort_mat(g_csr)
158 
159  do i=1,g_csr%nz
160  g_csr%jcn(i) = adata%cperm(g_csr%jcn(i))
161  end do
162 
163 
164  ! for every front f, for every variable i in f, determine the
165  ! coefficients j in (A'*A)(i,1:i) and
166  cyc=0
167  do f=1, adata%nnodes
168  do p=adata%cp_ptr(f), adata%cp_ptr(f+1)-1
169  i = porder(p)
170 
171  ! the the coefficient j exists in row i of A'*A iff i and j
172  ! are both present in a row of A. So, for each nnz k in column j of A,
173  ! every nnz (k,j) in A defines a coefficient (i,j) in A'*A
174  fcol(fcol_ptr(f+1)) = i
175  fcol_ptr(f+1) = fcol_ptr(f+1)+1
176 
177  do pp = graph%jptr(i), graph%jptr(i+1)-1
178  k = graph%irn(pp)
179 
180  ! for every nnz k in column i, go along the corresponding
181  ! row and, in the tree, go up until node f (containing i)
182  do ppp=g_csr%iptr(k), g_csr%iptr(k+1)-1
183  j = g_csr%jcn(ppp)
184  if(adata%icperm(j) .ge. adata%icperm(i)) exit
185 
186  ! (i,j) is in tril(A'*A). Go up the tree until node f, for
187  ! every node met, add column i to the corresponding
188  ! front
189  node = col_map(j)
190  do
191  ! go up the tree
192  if((mark(node) .eq. i) .or. (node .eq. f)) exit
193 
194  fcol(fcol_ptr(node+1)) = i
195  fcol_ptr(node+1) = fcol_ptr(node+1)+1
196 
197  mark(node) = i
198  node = parent(node)
199  end do
200  end do
201  end do
202 
203  end do
204  end do
205  call qrm_adealloc(mark)
206  call qrm_aalloc(adata%nfrows, adata%nnodes)
207  __qrm_check_ret(name,'qrm_aalloc2.5',9999)
208 
209 
210  ! n_weight(i) is meant to hold the weight of front i
211  ! t_weight(i) is meant to hold the weight of the subtree rooted at i
212  call qrm_aalloc(n_weight, adata%nnodes)
213  call qrm_aalloc(t_weight, adata%nnodes)
214 
215  n_weight = 0.d0
216  t_weight = 0.d0
217 
218  call qrm_aalloc(stair, maxval(rc)+1)
219  __qrm_check_ret(name,'qrm_aalloc',9999)
220 
221  call qrm_get(graph, 'qrm_ib', ib)
222  call qrm_get(graph, 'qrm_nb', nb)
223 
224  hsize=0
225  rsize=0
226 
227  ! determine structure and weight of all nodes
228  do f=1, adata%nnodes
229 
230 #if defined(debug)
231  col_map=0
232 #endif
233 
234  ! under the assumption of postordered nodes, we can simply sweep
235  ! the list of fronts
236 
237  ! build the col_map for front f. col_map(k)=j means that global
238  ! column k is column j inside front f
239  do j=1, rc(f)
240  k = fcol(fcol_ptr(f)+j-1)
241  col_map(k)=j
242  end do
243 
244  if(f .eq. 1) then
245  roff = 1
246  else
247  roff = adata%stair(f-1)+1
248  end if
249 
250  stair(1:rc(f)) = 0
251  ! assemble the rows from the original matrix
252  do p=roff, adata%stair(f)
253  ! i is a row of the original matrix to be assembled into front f
254  i = adata%rperm(p)
255  ! sweep this row and determine its first coefficient
256  first = col_map(g_csr%jcn(g_csr%iptr(i)))
257  ! first = rc(f)+1
258  ! do pp=g_csr%iptr(i), g_csr%iptr(i+1)-1
259  ! j = col_map(g_csr%jcn(pp))
260  ! ! TODO: can be optimized id g_csr is sorted
261  ! if (j .lt. first) first=j
262  ! end do
263  stair(first) = stair(first)+1
264  end do
265 
266  ! assemble the CBs from the children
267  do ppp=adata%childptr(f), adata%childptr(f+1)-1
268  c = adata%child(ppp)
269 
270  ! ne is the number of Householder vectors computed on the
271  ! child c. np is the number of fully assembled pivots in c
272  ne = min(rc(c), adata%nfrows(c))
273  np = adata%cp_ptr(c+1)-adata%cp_ptr(c)
274 
275  ! count in all the rows on the CB of c
276  do i=np+1, ne
277  j = fcol(fcol_ptr(c)+i-1)
278  first = col_map(j)
279  stair(first) = stair(first)+1
280  end do
281  end do
282 
283  ! finalize stair
284  do i=2, rc(f)
285  stair(i) = stair(i)+stair(i-1)
286  end do
287 
288  adata%nfrows(f) = stair(rc(f))
289 
290  ! At this point it is possible to determine the computational
291  ! weight of node f
292  ne = min(rc(f),adata%nfrows(f))
293  do i=1, ne
294  n_weight(f) = n_weight(f)+qrm_count_flops(max(stair(i)-i+1,0),rc(f)-i,1,'panel')
295  n_weight(f) = n_weight(f)+qrm_count_flops(max(stair(i)-i+1,0),rc(f)-i,1,'update')
296  hsize = hsize+max(stair(i)-i+1,0)
297  end do
298 
299  ! do the same for R
300  np = adata%cp_ptr(f+1)-adata%cp_ptr(f)
301  rsize = rsize + np*(np+1)/2 + np*(rc(f)-np)
302 
303  t_weight(f) = t_weight(f)+n_weight(f)
304  p = parent(f)
305  if(p .ne. 0) t_weight(p) = t_weight(p)+t_weight(f)
306 
307  end do
308 
309  totflops = sum(n_weight)
310 
311  graph%gstats(qrm_e_facto_flops_) = floor(totflops,kind(graph%gstats(1)))
312  graph%gstats(qrm_e_nnz_r_) = rsize
313  graph%gstats(qrm_e_nnz_h_) = hsize
314 
315 #if defined(debug)
316  __qrm_prnt_dbg('("Total estimated number of MFLOPS: ",i10)')floor(totflops)
317 #endif
318 
319 
320  call dqrm_spmat_destroy(g_csr, all=.true.)
321  __qrm_check_ret(name,'qrm_spmat_destroy',9999)
322 
323  ! these are no more needed
324  call qrm_adealloc(col_map)
325  call qrm_adealloc(stair)
326  call qrm_aalloc(lzero, adata%nnodes)
327  call qrm_aalloc(adata%small, adata%nnodes)
328 
329  ! at this point we start going down the tree until we identify a set
330  ! of nodes such that the subtrees rooted at them can be scheduled to
331  ! threads with a good load balancing. Small nodes (or subtrees) will
332  ! be pruned away during the descent.
333 
334  call qrm_aalloc(lzero_w, adata%nnodes)
335  call qrm_aalloc(aux, adata%nnodes+2, lbnd=0)
336  call qrm_get(graph, 'qrm_nthreads', nth)
337 
338  if(nth .gt. adata%nnodes) nth = adata%nnodes ! you never know
339 
340  call qrm_aalloc(proc_w, nth)
341 
342 
343  smallth = 0.01
344 10 continue
345 
346  totleaves = 0
347  adata%small = 0
348 
349  ! goto 20
350  nlz = 0
351  ! initialize the l0 layer with the root nodes
352  do i=1, adata%nnodes
353  if(parent(i) .eq. 0) then
354  if(t_weight(i) .gt. smallth*totflops) then
355  nlz = nlz+1
356  lzero(nlz) = i
357  lzero_w(nlz) = t_weight(i)
358  else
359  adata%small(i) = 1 ! node is too smal; mark it
360  end if
361  end if
362  if(adata%childptr(i+1) .eq. adata%childptr(i)) totleaves = totleaves+1
363  end do
364 
365  leaves = 0
366 
367  ! start the loop
368  godown: do
369  if(nth .eq. 1) exit ! shortcut for serial execution
370  if(nlz .gt. nth*max(2.d0,(log(real(nth,kind(1.d0)))/log(2.d0))**2)) exit ! exit if already too many nodes in l0
371 
372  proc_w = 0.d0
373  ! sort the nodes in l0 in order of decreasing weight
374  call qrm_mergesort(nlz, lzero_w(1:nlz), aux(0:nlz+1), order=-1)
375  call qrm_mergeswap(nlz, aux(0:nlz+1), lzero(1:nlz), lzero_w(1:nlz))
376 
377  ! map subtrees to threads round-robin
378  do i=1, nlz
379  ! find the least loaded proc
380  p = minloc(proc_w,1)
381  proc_w(p) = proc_w(p) + lzero_w(i)
382  end do
383 
384  ! all the subtrees have been mapped. Evaluate load balance
385  rm = minval(proc_w)/maxval(proc_w)
386  ! write(*,*)'==>',lzero_w(1:nlz)
387  ! write(*,*)'-->',nth,nlz,rm,leaves,totleaves
388  if((rm .gt. 0.9) .and. (nlz .ge. 1*nth)) exit ! if balance is higher than 90%, we're happy
389 
390  ! if load is not balanced, replace heaviest node with its kids (if any)
391  found = .false.
392  findn: do
393  if(leaves .eq. totleaves) exit godown
394 
395  if(leaves .eq. nlz) then
396  if(nlz .ge. nth*max(2.d0,(log(real(nth,kind(1.d0)))/log(2.d0))**2)) then
397  exit godown ! all the nodes in l0 are leaves. nothing to do
398  else
399  smallth = smallth/2.d0
400  if(smallth .lt. 0.0001) then
401  exit godown
402  else
403  goto 10
404  end if
405  end if
406  end if
407  n = lzero(leaves+1) ! n is the node that must be replaced
408 
409  ! appends children of n
410  do p=adata%childptr(n), adata%childptr(n+1)-1
411  c = adata%child(p)
412  if(t_weight(c) .gt. smallth*totflops) then
413  ! this child is big enough, add it
414  found = .true.
415  nlz = nlz+1
416  lzero(nlz) = c
417  lzero_w(nlz) = t_weight(c)
418  else
419  adata%small(c) = 1 ! node is too smal; mark it
420  end if
421  end do
422  if(found) exit findn ! if at least one child was added then we redo the mapping
423  leaves = leaves+1
424  end do findn
425 
426  ! swap n with last element
427  lzero(leaves+1) = lzero(nlz)
428  lzero_w(leaves+1) = lzero_w(nlz)
429  nlz = nlz-1
430 
431  end do godown
432 
433  ! mark all the children of nodes in l0
434  do i=1, nlz
435  n = lzero(i)
436  do p=adata%childptr(n), adata%childptr(n+1)-1
437  c = adata%child(p)
438  adata%small(c) = 1
439  end do
440  end do
441 
442 20 continue
443 
444  t_weight = t_weight/totflops * 100
445  ! call qrm_print_tree('atree.dot',adata, t_weight)
446 
447 
448  call qrm_adealloc(lzero)
449  call qrm_adealloc(lzero_w)
450  call qrm_adealloc(proc_w)
451  call qrm_adealloc(aux)
452  call qrm_adealloc(n_weight)
453  call qrm_adealloc(t_weight)
454 
455  call qrm_err_act_restore(err_act)
456  return
457 
458 9999 continue ! error management
459  call qrm_err_act_restore(err_act)
460  if(err_act .eq. qrm_abort_) then
461  call qrm_err_check()
462  end if
463 
464  return
465 
466 
467 contains
468 
469  subroutine sort_mat(mat)
470  use qrm_sort_mod
471  implicit none
472 
473  type(dqrm_spmat_type) :: mat
474  integer, allocatable :: aux(:)
475  integer :: i, n, k1, k2
476 
477 
478 
479  allocate(aux(0:mat%n+1))
480 
481  do i=1,mat%m
482  k1= mat%iptr(i)
483  k2= mat%iptr(i+1)-1
484  n = k2-k1+1
485  call qrm_mergesort(n, mat%jcn(k1:k2), aux(0:n+1))
486  call qrm_mergeswap(n, aux(0:n+1), mat%jcn(k1:k2))
487  end do
488 
489  deallocate(aux)
490 
491 
492  return
493  end subroutine sort_mat
494 
495 end subroutine dqrm_symbolic
This module contains all the error management routines and data.
subroutine dqrm_symbolic(graph)
This subroutine computes the symbolic QR factorization of a matrix.
Generif interface for the ::dqrm_pgeti, ::dqrm_pgetr and.
This module contains the interfaces of all non-typed routines.
subroutine qrm_err_act_restore(err_act)
Restores the value of the qrm_err_act variable.
This module contains the definition of the analysis data type.
This module contains the definition of the basic sparse matrix type and of the associated methods...
subroutine qrm_err_check()
This subroutine checks the errors stack. If something is found all the entries in the stack are poppe...
subroutine sort_mat(mat)
The main data type for the analysis phase.
subroutine dqrm_spmat_destroy(qrm_spmat, all)
This subroutine destroyes a qrm_spmat instance.
subroutine dqrm_spmat_convert(in_mat, out_mat, fmt, values)
This subroutine converts an input matrix into a different storage format. Optionally the values may b...
Generic interface for the ::qrm_count_realflops ::qrm_count_pureflops.
This type defines the data structure used to store a matrix.
int i
Definition: secs.c:40
This module contains routines for sorting.
subroutine qrm_err_act_save(err_act)
Saves a copy of the qrm_err_act variable.