Special Topics in CFD


CFD Open Series Revision: 1.85.1


Special Topics in CFD

Ideen Sadrehaghighi, Ph.D.

HVAC

Error Analysis

Uncertainty Quantification

CFD in Biomedical Applications

Smooth Particle Hydrodynamics

Meshless Schemes

Reduced Order Modeling

ANNAPOLIS, MD

Environmental Control


Contents

1 Introduction ................................................................................................................................ 13

2 Reduced Order Modeling (ROM) ......................................................................................... 17

3 Computational Error and Uncertainty Quantification ................................................. 31

Computational Predictivity plus Verification & Validation ............................................................... 13 Multiscale/Multiphysics:.................................................................................................................................. 14 Mesh Free Methods for CFD............................................................................................................................ 14 Integrated Simulations of Complex Systems ........................................................................................... 15

Various Techniques ............................................................................................................................................ 18 Common Features Shared by Reduced Order Methods (ROM) ....................................................... 18 Reduced Basis Methods .................................................................................................................................... 19 Lagrange ............................................................................................................................ 19 Hermit ................................................................................................................................ 19 Taylor ................................................................................................................................. 19 Snapshot Sets .................................................................................................................... 19 Proper Orthogonal Decomposition (POD) Spaces ................................................................................. 20 Galerkin Projection into POD Space .................................................................................. 21 Case Study - Vortex shedding around a circular cylinder using a POD-Galerkin Method . 21 2.4.2.1 Governing Equations ............................................................................................... 21 2.4.2.2 Details of the Full Order Simulation ........................................................................ 22 2.4.2.3 Details of the ROM Simulation ................................................................................ 23 2.4.2.4 Analysis of the Results ............................................................................................. 23 Addressing Challenges in Reduced-Order Modeling ............................................................................ 24 Reduced Order CFD Simulation..................................................................................................................... 25 Case Study 1 - Designing Parameters of Test Stage Axial Turbine ................................................. 27 Blade Reverse Engineering as applied to Geometry Definition ........................................ 28 3D Aerodynamic Computation .......................................................................................... 28 Case Study 2 - Cooling Air Flow Rate .......................................................................................................... 29 Reduced Order Model using Empirical Relationship ........................................................................... 29

Classification of Errors...................................................................................................................................... 32 Physical Modeling Error ................................................................................................................................... 32 Geometrical Modeling Errors ......................................................................................................................... 34 Spatial Discretization (Governing Equations) Errors .......................................................................... 35 Higher Order Discretization ............................................................................................... 36 Discretization Errors ......................................................................................................................................... 37 Mesh Density ..................................................................................................................... 37 Grid Independence Study .................................................................................................. 37 Grid Topology .................................................................................................................... 38 Sources of Discretization Error .......................................................................................... 39 Case Study – Hypersonic Flow over an Axisymmetric Sphere-Cone ................................. 40 Estimating Discretization Error .......................................................................................... 40 3.5.6.1 Case Study – Domain Discretization Error for the Transitional Flow over a Sharp Cone 42 Temporal Discretization Errors .................................................................................................................... 42


Iterative Convergence Errors ......................................................................................................................... 44 Monitoring Convergence using Residual History .............................................................. 45 Monitoring Quantitative Convergence.............................................................................. 45 Norms of Convergence Error ............................................................................................. 46 Case Study – 2D Flow Over a Hill ....................................................................................... 47 Computer Round off Errors ............................................................................................................................ 48 Truncation Errors ............................................................................................................................................... 48 Code Errors...................................................................................................................................................... 49 Benchmarking & Inter-Code Issues ...................................................................................................... 49 Case Study 1 – Results of M6 Wing using NASA Codes of the Same Grid ......................... 51 Case Study 2 - Grid Convergence for 3D Benchmark Turbulent Flows ............................. 51 3.11.2.1 Subsonic Flow around a Hemisphere Cylinder ........................................................ 51 3.11.2.2 Geometry, Flow Parameters, and Boundary Conditions ......................................... 52 3.11.2.3 Results for Hemisphere Cylinder ............................................................................. 53 3.11.2.4 Forces and Pitching Moment .................................................................................. 53 3.11.2.5 Fine Grid Surface Pressure, Skin Friction, and off-body variation .......................... 55 3.11.2.6 Effect of Grid Refinement on Surface Pressure and Skin Friction ........................... 56 3.11.2.7 Transonic Flow Around an M6 Wing ....................................................................... 57 3.11.2.8 Geometry, Flow Parameters and Boundary Conditions.......................................... 57 3.11.2.9 Grids for M6 Wing ................................................................................................... 57 3.11.2.10 Results for M6 Wing ................................................................................................ 58 3.11.2.11 Concluding Remarks ................................................................................................ 60 Usage Errors.................................................................................................................................................... 62 What to trust and what not to? ............................................................................................................... 62 Verification and Validation for Computational Simulation ......................................................... 63

4 CFD in Biomedical Applications ........................................................................................... 65

Literature Survey in Biomedical CFD.......................................................................................................... 66 Cardiovascular Systems ..................................................................................................... 66 Respiratory Systems .......................................................................................................... 67 Merits and Limitations of Biomedical Applications in CFD ............................................................... 69 Hemodynamic Flow Modeling ....................................................................................................................... 71 Boundary Conditions ......................................................................................................................................... 71 Structural Deformation Models..................................................................................................................... 72 Fluid-Structure Interaction Techniques .................................................................................................... 72 Future of CFD in Biomedical Engineering ................................................................................................. 73 Case Study 1 – CFD Simulation of Human Carotid Artery Bifurcation based on Anatomy and Volumetric Blood Flow rate Measured with MRI ............................................................................................... 74 Approaches ........................................................................................................................ 74 Results and Discussion....................................................................................................... 75 Case Study 2 - CFD Analysis of the Effect of Plaques in the Left Coronary Artery ................... 76 Patient Data Selection for Generation of Left Coronary Artery Model ............................. 77 Realistic Plaques Modelling ............................................................................................... 78 Generation of Computational Models............................................................................... 78 Application of Physiological Parameters ........................................................................... 79 Performance of Computational Hemodynamic Analysis................................................... 79 CFD Results of the Left Coronary Artery............................................................................ 80 4.9.6.1 Cutting Plane Visualization ...................................................................................... 80


4.9.6.2 Wall Shear Stress (WSS) Comparisons .................................................................... 81 Discussion .......................................................................................................................... 82 Limitation ........................................................................................................................... 84

5 Mesh Free Methods for CFD ................................................................................................... 85

6 HVAC in Building and Related Issues .............................................................................. 105

Smooth Particle Hydrodynamics (SPH) ..................................................................................................... 85 Mesh free Local Petrov-Galerkin ....................................................................................... 86 Mesh free Methods Based on Radial Basis Functions ....................................................... 87 Finite Point Methods ......................................................................................................... 87 Meshless Boundary Schemes ............................................................................................ 88 Solution Procedure for Mesh free Methods ............................................................................................ 88 Domain representation ..................................................................................................... 88 Function Approximation .................................................................................................... 88 Formation of System Equations ........................................................................................ 89 Solving the Global Equations ............................................................................................. 89 Method of Smooth Particle Hydrodynamics (SPH) ............................................................................... 89 Formulation ....................................................................................................................... 89 Smoothing Kernels............................................................................................................. 90 Updating of Smoothing Length h....................................................................................... 92 5.3.3.1 Constant .................................................................................................................. 92 5.3.3.2 Variable.................................................................................................................... 92 Boundary Treatment ......................................................................................................... 92 Virtual Particles.................................................................................................................. 92 Ghost Particles ................................................................................................................... 93 Summery and Recap .......................................................................................................... 93 Case Study 1 - Lid Driven Cavity Problem .......................................................................... 93 Case Study 2 - Two-dimensional Convection–Diffusion Problem ..................................... 94 RKPM Method ....................................................................................................................................................... 94 Lagrangian Description of Fluid Dynamics Using SPH ........................................................................ 95 Default Kernel .................................................................................................................... 95 Numerical Time Integration............................................................................................... 97 5.5.2.1 The Implicit Euler Scheme ....................................................................................... 
97 5.5.2.2 The Verlet Scheme .................................................................................................. 97 5.5.2.3 The Leap-Frog Scheme ............................................................................................ 97 Collision Handling .............................................................................................................. 98 Case Study 1 – Comparison of Weakly Compressible and Incompressible SPH ................ 98 5.5.4.1 Formulation of Problem .......................................................................................... 98 5.5.4.2 Results ..................................................................................................................... 99 Case Study 2 - Dam Break Water Flow using Lagrangian Description ............................... 99 Case Study 3 - Dam Break using MLPG-RBF and Shallow Water Equations .................... 100 Case Study 4 - SPH Method for Evaporating Multiphase Flows ...................................... 101 5.5.7.1 Basic Formulations of the SPH Method................................................................. 102 5.5.7.2 Evaporation of a Static Drop ................................................................................. 102 5.5.7.3 Evaporation of a Dynamic Drop Impacting on a Hot Surface ................................ 102 5.5.7.4 Concluding Remarks .............................................................................................. 103


Thermal Analysis in Buildings .................................................................................................................... 105 Ventilation Analysis ......................................................................................................... 106 6.1.1.1 Numerical Simulations of the Effect of Outdoor Pollutants on Indoor Air Quality of Buildings next to a street canyon .............................................................................................. 106 HVAC & Environmental Issues ................................................................................................................... 107 Introduction to Air-Conditioning Processes .................................................................... 108 The Role of CFD In HVAC System Optimization ............................................................... 109 6.2.2.1 Why Use CFD Analysis In HVAC Design ................................................................. 109 Case Study 1 - Aircraft Hangar Fire & Smoke Model ....................................................... 110 6.2.3.1 Results ................................................................................................................... 111 Case Study 2 - CFD Modeling Approach for HVAC Systems Analysis .............................. 111 6.2.4.1 Modeling and Simulation Approach ...................................................................... 111 6.2.4.2 Results and Discussion .......................................................................................... 112

7 CFD Applications in Other Areas ...................................................................................... 114

8 Modern Computer Architectures ..................................................................................... 125

Food Processing ................................................................................................................................................ 114 Drying............................................................................................................................... 114 Sterilization ...................................................................................................................... 115 Mixing .............................................................................................................................. 116 Refrigeration .................................................................................................................... 117 Crystallization .................................................................................................................. 117 Pasteurization .................................................................................................................. 117 Heat Exchangers ............................................................................................................................................... 118 CFD in Semiconductor Industry ................................................................................................................. 118 Brief Description of Semiconductor Devices ................................................................... 118 Thermal Management in Semiconductors ...................................................................... 119 Can You Really Fry an Egg on a CPU?............................................................................... 120 Magneto-Hydro-Dynamics (MHD) ............................................................................................................ 121 MHD Equations ................................................................................................................ 121 Case Study - Dynamics of a Q2D Wake Behind a Cylinder in presence of MHD Environment ................................................................................................................................... 122 7.4.2.1 Numerical Method and Geometry ........................................................................ 123 7.4.2.2 Result and Discussion ............................................................................................ 124

Background......................................................................................................................................................... 125 Memory Technology ....................................................................................................................................... 126 Memory Access Time....................................................................................................... 126 Memory Access Patterns ................................................................................................. 126 8.2.2.1 Loop Interchange to Ease Memory Access Patterns ............................................. 127 Virtual Memory ............................................................................................................... 127 Registers .............................................................................................................................................................. 128 Caches ................................................................................................................................................................... 128 Cache Organization.......................................................................................................... 130 8.4.1.1 Direct-Mapped Cache............................................................................................ 131 8.4.1.2 Fully Associative Cache .......................................................................................... 132 8.4.1.3 Set-Associative Cache ............................................................................................ 132


8.4.1.4 Instruction Cache................................................................................................... 133 Timing a Program ............................................................................................................................................ 133 Timing a Portion of the Program ..................................................................................... 135 Getting Time Information ................................................................................................ 135 Subroutine Profiling ........................................................................................................................................ 136 Loop Optimizations ......................................................................................................................................... 138 Operation Counting ......................................................................................................... 138 Basic Loop Un-Rolling ...................................................................................................... 140 Loops with Low Trip Counts ............................................................................................ 141 Fat Loops.......................................................................................................................... 141 Loops Containing Procedure Calls ................................................................................... 142 Loops with Branches ........................................................................................................ 142 Nested Loops ................................................................................................................... 142 Outer Loop Un-Rolling ..................................................................................................... 143 Loop Interchange to Move Computations to the Center ................................................ 144 Matrix Multiplication ...................................................................................................................................... 144 Matrix Optimization ........................................................................................................ 145 Blocking to Ease Memory Access Patterns ...................................................................... 146 Shared-Memory Parallel Processors ........................................................................................................ 146 Dependencies .................................................................................................................. 147 8.9.1.1 Control Dependencies ........................................................................................... 148 8.9.1.2 Data Dependencies ............................................................................................... 149 Forming a Flow Graph ..................................................................................................... 149 8.9.2.1 Loop Dependencies ............................................................................................. 150 8.9.2.2 Loop-Carried Dependencies .................................................................................. 151 8.9.2.3 Flow Dependencies ............................................................................................... 152 8.9.2.4 Output Dependencies ........................................................................................... 
153 8.9.2.5 Dependencies Within an Iteration ........................................................................ 153 Pointer Ambiguity in C............................................................................................................................. 154

9 Parallel Processing and HPC .............................................................................................. 157
Classification of Parallel Computers Architecture.............................................................................. 157 Shared Memory Multi-Processor ............................................................................................... 159 Distributed Memory Multi-Computer ..................................................................................... 159 Efficiency and Scalability .............................................................................................................................. 160 Weak vs. Strong Scaling ................................................................................................... 162 Scalability vs. Performance .............................................................................................. 162 Load Balancing ................................................................................................................. 163 Performance of CFD Codes ........................................................................................................................... 163 CFD for Next Generation High Performance Computing ................................................ 164 Hardware Consideration and CPU vs. GPU Technology .................................................. 164 9.5.2.1 Case Study 1 – 2D Laplace Equation...................................................................... 165 9.5.2.2 Results ................................................................................................................... 165 9.5.2.3 Future Work – Heterogeneous Computing ........................................................... 165 Case Study 2 - Unstructured Grid Based CFD Solvers on Modern Graphics Hardware ... 166 9.5.3.1 Background and Literature Survey ........................................................................ 166 9.5.3.2 Implementation on Graphics Hardware ................................................................ 167 9.5.3.3 Test Cases .............................................................................................................. 167


Software Consideration and Message Passing Interface (MPI) .................................................... 169 Cloud Computing: Definition and Features ........................................................................................... 170 High Performance Computing (HPC) ....................................................................................................... 170 Real Application Performance ......................................................................................... 172 Choosing the right interconnect ...................................................................................... 172 Grid Computing vs. HPC ................................................................................................................................ 172 HPC vs. HSC ................................................................................................................................................ 173 The Moral of the Story ............................................................................................................................. 173 HPC vs. Parallel Computing.................................................................................................................... 173 HPC vs. HTC ................................................................................................................................................ 174

10 CFD and HPC Trends Forecasted for 2030 .................................................................... 175

Comparison of Semiconductor fabrication sizes in HPC ........................................................... 175 Current Status of CFD ............................................................................................................................... 176 Conceptual Design ........................................................................................................... 176 Preliminary/Detailed Design ............................................................................................ 176 Product Validation and Certification ............................................................................... 177 CFD usage of High Performance Computing (HPC) ......................................................... 177 Turbulence Modeling....................................................................................................... 177 Process Automation ........................................................................................................ 178 Solution Uncertainty and Robustness ............................................................................. 178 Multidisciplinary Analysis and Optimization (MDAO) ..................................................... 179 Vision of CFD in 2030 as anticipated by NASA .............................................................................. 179 Technology Roadmap to achieve GC challenge ............................................................... 181 10.3.1.1 High Performance Computing (HPC) ..................................................................... 181 10.3.1.2 Physical Modeling .................................................................................................. 182 10.3.1.3 Numerical Algorithms ............................................................................................ 182 10.3.1.4 Uncertainty Quantification (UQ) ........................................................................... 184 10.3.1.5 Geometry and Grid Generation............................................................................. 184 10.3.1.6 Knowledge Extraction............................................................................................ 185 10.3.1.7 Multidisciplinary Design and Optimization ........................................................... 186 Recommendations ........................................................................................................... 186 HPC Envisioned by Department of Energy (DOE) ........................................................................ 188 What is Exascale Computing? .......................................................................................... 188 Why Exascale? ................................................................................................................. 188 Range of Applications may be Transformed by Going to the Exascale ........................... 188 10.4.3.1 Aerospace, Airframes and Jet Turbines ................................................................. 189 10.4.3.2 Combustion ........................................................................................................... 191 10.4.3.3 Climate Modeling .................................................................................................. 192 10.4.3.4 Computational Biology .......................................................................................... 
192 10.4.3.5 Materials Science................................................................................................... 193 10.4.3.6 Nuclear Engineering .............................................................................................. 194 10.4.3.7 Others Disciplines .................................................................................................. 195 Challenges in Going to the Exascale ................................................................................ 195 10.4.4.1 The Hardware Challenges...................................................................................... 196 10.4.4.2 The Applied Mathematics Challenges ................................................................... 197 10.4.4.3 Mathematical Modeling ........................................................................................ 197 10.4.4.4 Numerical Algorithms ............................................................................................ 198


10.4.4.5 The Algorithmic Challenges ................................................................................... 199
10.4.4.6 Computer Science Challenges ............................................................................... 200
10.4.4.7 Educational Challenges .......................................................................................... 200

11 Artificial Intelligence in CFD .............................................................................................. 202

12 Appendix A ............................................................................................................................... 219

Machine Learning ...................................................................................................................................... 202 Difference Between Artificial Intelligence and Machine Learning .................................. 202 Deep Learning ............................................................................................................................................. 203 Data Mining ........................................................................................ Error! Bookmark not defined. Types of Problems and Tasks ............................................................................................................... 204 Supervised Learning......................................................................................................... 204 Unsupervised Learning .................................................................................................... 204 Reinforcement Learning .................................................................................................. 204 List of Common Machine Learning Algorithms ............................................................... 204 11.4.4.1 Linear Regression .................................................................................................. 205 11.4.4.2 Logistic Regression ................................................................................................ 205 11.4.4.3 Decision Tree ......................................................................................................... 206 11.4.4.4 Artificial Neural Networks (ANNs) ......................................................................... 206 11.4.4.5 Case Study - Prediction of the Maximal Wall Shear Stress (MWSS) Value for Carotid Artery Bifurcation ...................................................................................................................... 207 11.4.4.6 Model Explanation ..................................................... Error! Bookmark not defined. Machine Learning in Fluid Dynamics ................................................................................................ 208 Motivation and Objectives .............................................................................................. 208 Design and Optimization Issue ........................................................................................ 209 Accomplishments ............................................................................................................ 209 Field Inversion and Machine Learning in Support of Data Driven Environment ............. 210 11.5.4.1 Artificial Neural Networks (ANNs) ......................................................................... 210 The POD as Linear Artificial Neural Network (LANN) ...................................................... 211 11.5.5.1 POD and Nonlinear ANN........................................................................................ 212 Overview of ANNs in Turbulence Applications ................................................................ 213 The Future of ANNs for Fluids Modelling ........................................................................ 214 Some Preliminary Concepts in Quantum Computation ............................................................. 215 What is Quantum Computing? ........................................................................................ 215 How do Quantum Computers work and what they can do? 
........................................... 215 Classical vs. Quantum Computing ................................................................................... 215 Qubits, and Power of a Quantum Computer................................................................... 216 Quantum Algorithms: Programming a Quantum Computer ........................................... 216 11.6.5.1 Could Quantum Computing Methods Improve Iterative Calculations in CFD?..... 217 11.6.5.2 Quantum Speedup for Turbulent Combustion Simulations .................................. 218 11.6.5.3 Large Eddy Simulation (LES) and Filtered Density Function (FDF) ........................ 218

Routine for Inverse Distance Weighted Interpolation (Shepard’s Method) ..................... 219

List of Tables

Table 2.1 Main Parameters of the test stages (P1) and (P2) ................................................................... 27
Table 3.1 Discretization Error for 2D Burger’s Equation ......................................................................... 41


Table 3.2 NASA Code Comparisons for Surface Forces in M6 Wing ....................................................... 51 Table 3.3 Statistics of four finest grids for hemisphere cylinder grid families (Courtesy of [Diskin et al.]) .............................................................................................................................................................. 52 Table 3.4 Hemisphere Cylinder: Variation of Aerodynamic Coefficients on L1 Grids – (Courtesy of [Diskin et al.]) .............................................................................................................................................. 54 Table 3.5 Statistics of Grids for OM6 Wing Grid Families ....................................................................... 58 Table 3.6 Variations of Aerodynamic Coefficients - (Courtesy of [Diskin et al.]) .................................... 59 Table 8.1 Memory Access Speed on a DEC Alpha ................................................................................. 128 Table 10.1 Three Order of Magnitude Jump ......................................................................................... 196 Table 10.2 Potential Exascale Computer Design for 2018 and its relationship to current HPC designs (DOE) ......................................................................................................................................................... 196

List of Figures Figure 1.1 Active on-going Research Area in CFD ................................................................................... 13 Figure 1.2 Integrated Simulation for Nuclear Engineering...................................................................... 15 Figure 2.1 Interpolation on a matrix manifold ........................................................................................ 17 Figure 2.2 Comparison of the drag coefficient obtained with the High Fidelity (HF) and ROM simulations .................................................................................................................................................. 23 Figure 2.3 Comparison between velocity and pressure High Fidelity (HF)-ROM .................................... 24 Figure 2.4 1D vs 3D Analysis .................................................................................................................... 26 Figure 2.5 Turbine flow design process ................................................................................................... 28 Figure 2.6 Profile of Blades...................................................................................................................... 28 Figure 2.7 Typical Cooling System Network for Airflow Rate.................................................................. 29 Figure 3.1 Pressure Coefficient at 20% Chord Length using Different Turbulence Model ..................... 33 Figure 3.2 Effects of Different Turbulence Models in a Steep Obstacle ................................................. 34 Figure 3.3 Inviscid stencil with 1st order cells in red and 2nd order cells in green ................................... 35 Figure 3.4 Viscous stencil with viscous cells in blue and 2nd order cells in green ................................... 35 Figure 3.5 Effect of 1st and 2nd Order Differencing Scheme in Error ....................................................... 36 Figure 3.6 Effect of Pe Number in balancing Diffusive and Convective Flows ........................................ 36 Figure 3.7 Effects of mesh density on solution domain .......................................................................... 38 Figure 3.8 Domain Topology (O-Type, C-Type, and H-Type; from left to right) ...................................... 39 Figure 3.9 Contours of Total Estimated Discretization Error in Density ................................................. 40 Figure 3.10 Exact error, Estimated error scheme for viscous Burgers’ equation (Courtesy of Yan and Ollivier-Gooch) ............................................................................................................................................ 42 Figure 3.11 Relative discretization error for the transitional flow over a sharp cone ............................ 43 Figure 3.12 Temporal Discretization Criteria .......................................................................................... 44 Figure 3.13 Effect of CFL Number on Convergence of 1D Wave Equation ............................................. 45 Figure 3.14 Estimated Iteration Error of U1 for Different Level of Tolerance criteria et ......................... 47 Figure 3.15 Global View of and Boundary Conditions (Courtesy of [Diskin et al.])................................. 
52 Figure 3.16 Global View of Hemisphere Cylinder Pressure Contours using L1 grid at surfaces y = 0 (left) and x = 6 (right); (Courtesy of [Diskin et al.]) .............................................................................................. 53 Figure 3.17 Grid Convergence of Aerodynamic Forces for Hemisphere Cylinder (Courtesy of [Diskin et al.]) .............................................................................................................................................................. 55 Figure 3.18 Global View of Surface Pressure and Skin Friction at symmetry plane (y = 0) for Hemisphere Cylinder – (Courtesy of [Diskin et al.]) .................................................................................... 56 Figure 3.19 M6 wing: pressure contours computed by USM3D on family 4 prism/hex L1 grid (Courtesy of [Diskin et al.]) ......................................................................................................................... 58 Figure 3.20 M6 Grid Convergence of Aerodynamic Forces CL, CD ........................................................... 59


Figure 3.21 M6 Grid Convergence of Pitching Moment ......................................................................... 60 Figure 3.22 M6 section 1 (η = x/c = 0.2) View of leeside Pressure Grid Refinement - (Courtesy of [Diskin et al.]) .............................................................................................................................................. 61 Figure 4.1 Example of CFD simulations in cardiovascular and respiratory systems ............................... 65 Figure 4.2 CFD Model Construction for Biomedical Application ............................................................. 70 Figure 4.3 Axial velocity and Time Average............................................................................................. 75 Figure 4.4 Anatomic Model for the Patient with Carotid Artery Plaque................................................ 76 Figure 4.5 3D CT visualization of a normal left coronary artery with coronary artery disease .............. 77 Figure 4.6 Plaque distribution in left coronary artery Model ................................................................. 78 Figure 4.7 The EPL Posterior View at left Coronary Artery ..................................................................... 80 Figure 4.8 Flow velocity observed in pre and post plaque simulated models ........................................ 81 Figure 4.9 Cross-sectional views of A–E at the left main stem ............................................................... 82 Figure 4.10 Comparison of WSS between non-Newtonian and Newtonian Models Observed in Coronary Artery with Presence of Plaques ................................................................................................. 83 Figure 5.1 Domain representation .......................................................................................................... 88 Figure 5.2 Different type of Support domains ........................................................................................ 89 Figure 5.3 1-D SPH Characterization ....................................................................................................... 89 Figure 5.4 The choice of Different Smooth Kernel in 1D (h=1) ............................................................... 91 Figure 5.5 Ghost Particles, Velocities are formed Symmetrically (slip wall) ........................................... 92 Figure 5.6 Virtual Particles ...................................................................................................................... 92 Figure 5.7 Example of a 1D task, particle j is Situated in the Near Boundary Area ................................ 92 Figure 5.8 Comparison with FDM with SPH for Lid Driven Cavity ........................................................... 93 Figure 5.9 The diagram of global domain Ω, local support domain Ωs of point xs, global points x and local point xi ................................................................................................................................................ 94 Figure 5.10 Lagrange particle-based fluid structure in 2D ...................................................................... 95 Figure 5.11 The default kernel and its derivatives in one dimension for h=1......................................... 96 Figure 5.12 The leap-frog mechanism ..................................................................................................... 
97 Figure 5.13 Comparison of ISPH (upper), FEM (middle) and WCSPH (lower) velocity contours for the angle of attack of 15 degrees at Re = 570 (Courtesy of Shadloo105) .......................................................... 99 Figure 5.14 Dam-Break Flow of water ................................................................................................... 100 Figure 5.15 Geometry and Water surface profile of the 2D dam-break problem at t =7.2 s. .............. 100 Figure 5.16 Snapshots of the Evaporating Drop at different times using SPH...................................... 102 Figure 5.17 Evolution of Dynamic Drop impact on a hot surface using SPH......................................... 103 Figure 6.1 Impact of Window Opening Percentage (WOP) on indoor air quality ................................. 107 Figure 6.2 Effect of window loading in outdoor pollutants .................................................................. 108 Figure 6.3 Pollution between two buildings separated by a street ...................................................... 108 Figure 6.4 Study for HVAC design of an aircraft hangar ........................................................................ 111 Figure 6.5 Building Schematic with internal configuration ................................................................... 112 Figure 6.6 Post Processing of Results .................................................................................................... 113 Figure 7.1 Illustrates the various classes of conductors........................................................................ 119 Figure 7.2 Modern Semiconductor ....................................................................................................... 119 Figure 7.3 Thermal Management of Semiconductor (courtesy of Mentor CFD) .................................. 120 Figure 7.4 An Example of an Egg Frying on a CPU ................................................................................. 120 Figure 7.5 Right Hand Rule for MHD ..................................................................................................... 122 Figure 7.6 Schematic diagram of numerical domain............................................................................. 123 Figure 7.7 Contour plots of vorticity snapshot at Red = 160 and at Hartmann number as indicated .. 124 Figure 8.1 Contributions from other disciplines to CFD ........................................................................ 125 Figure 8.2 Cache Lines can come from Different Parts of Memory ...................................................... 129


Figure 8.3 Many memory addresses map to the same cache line ........................................................ 131 Figure 8.4 Two-Way Set-Associative Cache .......................................................................................... 133 Figure 8.5 Sharp Profiling (right) vs. Flat Profiling (right) ...................................................................... 137 Figure 8.6 (a) Control Dependency; (b) A section of your program; (c) Expensive Operation Moved so that it's Rarely Executed ........................................................................................................................... 148 Figure 8.7 Types of Data Dependencies ................................................................................................ 149 Figure 8.8 Flow Graph for Data Flow Analysis....................................................................................... 150 Figure 8.9 Flow Graph including a loop ................................................................................................ 151 Figure 9.1 Multi-Processor vs. Multi-Computer .................................................................................... 158 Figure 9.2 Shared Memory Multi-Processor ......................................................................................... 159 Figure 9.3 Distributed Memory Multi-Processor .................................................................................. 160 Figure 9.4 Amdahl's Law........................................................................................................................ 161 Figure 9.5 Example of Strong Scalability ............................................................................................... 162 Figure 9.6 Architecture differences between CPU and GPU ................................................................. 164 Figure 9.7 Results for V-Cycle Multigrid ................................................................................................ 165 Figure 9.8 Heterogeneous Computing using CPUs and GPUs ............................................................... 165 Figure 9.9 Pressures at the Surface and Plane for the NACA 00012 (Left) and at the Surface for the Missile (Right) ........................................................................................................................................... 167 Figure 9.10 Running Times in double Precision Per Element Per Iteration for the NACA0012 (top) and Missile (bottom) ........................................................................................................................................ 168 Figure 9.11 Maui High Performance Computing Center ....................................................................... 171 Figure 9.12 Performance rate of two HPC for benchmark CFD Analysis .............................................. 172 Figure 9.13 Scope of HPC and HSC ........................................................................................................ 173 Figure 10.1 Changing Predictions About Semiconductor Sizes ............................................................. 175 Figure 10.2 Proposed New Computational Sciences Program Structure .............................................. 187 Figure 10.3 Computer speed and memory requirements for the Grand Challenge ............................. 
189 Figure 10.4 A supersonic Jet Engine Nozzle Rapidly Accelerates High-Pressure Gas into the Atmosphere .............................................................................................................................................. 190 Figure 10.5 Detail View of 9-Billion Atom Molecular Dynamics Simulation Instability ........................ 194 Figure 11.1 Schematics of AI, Machine Learning and Deep Learning .................................................. 203 Figure 11.2 Linear Regression ............................................................................................................... 205 Figure 11.3 Decision Tree ...................................................................................................................... 206 Figure 11.4 Artificial Neutral Network (ANN)........................................................................................ 207 Figure 11.5 Neutral Networks ............................................................................................................... 208 Figure 11.6 Calibration Cases for off line data ...................................................................................... 210 Figure 11.7 Network diagram for a feed-forward NN with three inputs and one output ................... 211 Figure 11.8 Comparison of linear POD (top) and Neural Networks (bottom) ...................................... 213 Figure 11.9 Skin Friction Coefficient for Onera M6 match to within 2% ............................................. 213 Figure 11.10 Difference Processing Between Classical and Quantum Computer ................................ 216 Figure 11.11 The Bloch Sphere is a Representation of a qubit, the fundamental building block of quantum computers ................................................................................................................................. 216


1 Introduction
As evident in Figure 1.1 below, there is no shortage of active research areas in CFD. Besides the regular on-going research in new algorithms, there is an ever-expanding range of new activities, some (but not all) of which are mentioned here. The more prominent research areas are shown in Figure 1.1, and some of them are defined below.

Figure 1.1 Active on-going Research Areas in CFD: advanced algorithms, meshfree methods, aeroelasticity, micro/nano fluids, HPC, biomedical applications, combustion, integrated simulations, turbulence, computational predictivity, multiphase flows, and aeroacoustics & noise

Computational Predictivity plus Verification & Validation
This includes relatively well-defined tasks, such as verification of the correctness of computer codes and uncertainty quantification, as well as hazier ones, such as validation of the model being used. As codes become more complex, their verification becomes more challenging; methods such as the Method of Manufactured Solutions are one way to address it. In its simplest form, uncertainty quantification is simply the propagation of uncertainties in parameters, properties, and models to the final solution. Although conceptually simple, this is a formidable task, both because we need to know all elementary uncertainties and because of the number of computations involved. Another avenue is the sensitivity analysis of the solution with respect to the design variables of interest, which can be achieved by computing the first-order differentials that indicate the maxima/minima of the function. Statistical variation has certainly proved its value in many areas, such as quality control in manufacturing, and uncertainty quantification is likely to become increasingly important in the use of simulations in design.
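As a purely illustrative sketch of these two ideas, the snippet below propagates assumed Gaussian input uncertainties through a placeholder model by Monte Carlo sampling and estimates first-order sensitivities by finite differences. The model, the parameter values, and the function names are hypothetical and are not taken from any specific code discussed in this document.

```python
import numpy as np

def propagate_uncertainty(model, means, stds, n_samples=2000, seed=0):
    """Monte Carlo propagation: sample uncertain inputs, evaluate the model, return output statistics."""
    rng = np.random.default_rng(seed)
    samples = rng.normal(means, stds, size=(n_samples, len(means)))
    outputs = np.array([model(p) for p in samples])
    return outputs.mean(), outputs.std()

# Toy 'model': a drag-like response to two uncertain inputs (purely illustrative).
model = lambda p: 0.5 * p[0] * p[1] ** 2

mean_out, std_out = propagate_uncertainty(model, means=[1.2, 10.0], stds=[0.05, 0.5])

# First-order sensitivities (the '1st order differentials' mentioned above) by finite differences.
base, eps = np.array([1.2, 10.0]), 1e-6
sensitivities = [(model(base + eps * np.eye(2)[i]) - model(base)) / eps for i in range(2)]
```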

Multiscale/Multiphysics
Multiscale is a broad term that usually means what the user intends it to. In most cases, however, it is used to mean phenomena where some aspects of the physics that we wish to compute must be described by a different physical model. This can include contact lines in multiphase flow simulations represented by molecular or phase-field models, reaction zones, shocks in rarefied gases, and so on. While we often think of multiscale as representing different physical processes, such as continuum and non-continuum descriptions, it also applies to the same physics modeled in different ways, such as when small drops are modeled as point particles. Numerical challenges include how to blend one description with another.

Mesh Free Methods for CFD
While the generation of meshes has always posed challenges for computational scientists, the problem has become more acute in recent years. While algorithms have seen great advances, mesh generation has lagged behind, creating a computational bottleneck. For industry and government looking to impact current and future products with simulation technology, mesh generation imposes great challenges. Many generation procedures lack automation, requiring many man-hours, which are becoming far more expensive than computer hardware. More automated methods are less reliable for complex geometry with sharp corners, concavity, or other complex features. Most mesh generation methods to date require a great deal of user expertise to obtain accurate simulation results. Since the application of computational methods to real-world problems appears to be paced by mesh generation, alleviating this bottleneck potentially impacts an enormous range of problems1. Meshless methods applied to computational fluid dynamics are a relatively new area of research designed to help alleviate the burden of mesh generation. Despite their recent beginnings, there exists no shortage of formulations and algorithms for meshless schemes in the literature. A brief survey of the field reveals varied approaches arising from diverse mathematical backgrounds applied to a wide variety of applications. All meshless schemes attempt to bypass the use of a conventional mesh, entirely or in part, by discretizing the governing partial differential equations on scattered clouds of points or collections of smooth blobs of particles. There are two different approaches which are called meshless. One contains methods like surface panel methods, boundary element methods, etc., which do not contain a volume grid. The other type uses an arbitrary distribution of points in the computational domain; particle methods also belong to this category, where the particles themselves act as discretization points. The method is called meshless because the points need not form any grid and they do not have to be arranged in any particular manner. The main motivation of meshless methods is that it is much easier to generate a point cloud. The accuracy of grid-based methods depends on the quality of the grid, and

1 Aaron Jon Katz, "Meshless methods for computational fluid dynamics", ResearchGate, January 2009.

so you have to ensure orthogonality, or make sure that elements are not highly skewed, while meshless methods are not much affected by how the points are distributed2. Sorting and classifying the many meshless methods is no simple task. To add to the confusion, meshless schemes fall under many other names, including mesh free, grid free, grid less, generalized finite difference, and Smooth Particle Hydrodynamics (SPH). We try to adopt the mesh-free vocabulary here. Among these methods, Smooth Particle Hydrodynamics (SPH) is distinctive: the fluid mass is lumped into smoothed blobs that are moved using Newton's second law directly, without an underlying mesh. In SPH the fluid is modeled as a collection of smooth "blobs" or particles3.

Integrated Simulations of Complex Systems
Engineers have long desired to have computational models that describe systems consisting of many coupled components. At the simplest level such simulators model the dynamics of connected rigid bodies, lumped models of chemical and power plants, and so on. As computers become more powerful we are seeing growing efforts to attempt much more complex modeling, such as of rockets (the Illinois ASCI center) or a nuclear power plant (CASL), and other DOE research-hub funded efforts. Other examples include the Human Body Simulator Project in Japan (led by S. Takagi) and possibly the recently announced Living Earth Simulator proposal by D. Helbing. Overall there is a very limited theoretical basis for how to do the coupling (with some exceptions, such as for solid/fluid problems), and there is considerable room for significant progress. As the ASCI programs, CASL and other efforts suggest, this is going to be a very significant area in the future (see Figure 1.2).

Figure 1.2 Integrated Simulation for Nuclear Engineering

2 From CFD Online Forum.
3 Grétar Tryggvason, "Smooth Particle Hydrodynamics", Lecture Series 2013.


2 Reduced Order Modeling (ROM)
Many modern mathematical models of real-life processes pose challenges when used in numerical simulations, due to complexity and large size (dimension). Model order reduction aims to lower the computational complexity of such problems, for example in simulations of large-scale dynamical systems and control systems. By a reduction of the model's associated state-space dimension or degrees of freedom, an approximation to the original model is computed. This Reduced Order Model (ROM) can then be evaluated with lower accuracy but in significantly less time. Reduced order models (ROM) can be thought of as computationally inexpensive mathematical representations that offer the potential for near real-time analysis. While most ROMs can operate in near real time, their construction can however be computationally expensive, as it requires accumulating a large number of system responses to input excitations. Furthermore, ROMs usually lack robustness with respect to parameter changes and therefore must often be rebuilt for each parameter variation. Together, these two issues underline the need for a fast and robust method for adapting pre-computed ROMs to new sets of physical or modeling parameters. However, ROMs and their corresponding Reduced Order Bases (ROB) are quantities that typically belong to nonlinear matrix manifolds. As such, classical interpolation methods fail, as they are not able to enforce the constraints characterizing those manifolds. The first part of the project consists of designing a suitable interpolation method enforcing those constraints. A schematic representation of the algorithm is shown in Figure 2.1 (Interpolation on a matrix manifold). It relies on identifying the correct manifold for the given application, constructing the appropriate logarithm mapping to move the interpolation data to a tangent space to this manifold where a standard multivariate interpolation algorithm can be applied, and constructing the appropriate exponential mapping to bring the computed result back to the manifold of interest4 (a minimal numerical sketch of this three-step procedure is given after the list below). The purpose of reduced order models (ROMs) is:

• taking advantage of redundancies
• identifying 'genuine' degrees of freedom
• giving low dimensional approximations (few modes)
• preserving a satisfactory accuracy
• decreasing the computational resources (time & storage)
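The three-step procedure sketched in Figure 2.1 (logarithm map to a tangent space, standard interpolation there, exponential map back) can be illustrated with the commonly used Grassmann-manifold log/exp formulas for reduced-order bases. The code below is only a rough sketch under that assumption; the function names, the choice of reference point, and the entrywise linear interpolation of the tangent matrices are illustrative choices, not the specific algorithm of the group cited here.

```python
import numpy as np

def log_map(Y0, Yi):
    """Grassmann logarithm: map the subspace spanned by Yi to the tangent space at Y0."""
    Y0tYi = Y0.T @ Yi
    M = (Yi - Y0 @ Y0tYi) @ np.linalg.inv(Y0tYi)
    U, s, Vt = np.linalg.svd(M, full_matrices=False)
    return U @ np.diag(np.arctan(s)) @ Vt

def exp_map(Y0, Gamma):
    """Grassmann exponential: map an (interpolated) tangent matrix back to an orthonormal basis."""
    U, s, Vt = np.linalg.svd(Gamma, full_matrices=False)
    Y = Y0 @ Vt.T @ np.diag(np.cos(s)) + U @ np.diag(np.sin(s))
    return np.linalg.qr(Y)[0]                 # re-orthonormalize for safety

def interpolate_basis(params, bases, p_new, i_ref=0):
    """Interpolate pre-computed reduced-order bases at a new parameter value p_new."""
    Y0 = bases[i_ref]
    gammas = np.stack([log_map(Y0, Y) for Y in bases])       # tangent-space images
    interp = np.array([[np.interp(p_new, params, gammas[:, i, j])
                        for j in range(gammas.shape[2])]
                       for i in range(gammas.shape[1])])      # 1-D linear interpolation per entry
    return exp_map(Y0, interp)
```

Here `params` is a 1-D array of training parameter values and `bases` a list of matching orthonormal basis matrices; in practice a multivariate or higher-order interpolation would replace the simple entrywise `np.interp`.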

Reduced Order Models (ROMs) based on statically non-linear flow solutions, but with a dynamically time-linear approach, have been developed. Thus unsteady flows that are a small perturbation about a steady flow with shocks and separations are modelled. This makes ROMs ideal for applications such as flutter clearance and aero-servo-elasticity. To generate a ROM about a particular non-linear mean solution, the dynamically time-linear response must be extracted from the CFD code.

4 Farhat Research Group.


Various Techniques
There is a large variety of ROMs on the market; they are also known as surrogate models. A common approach for model order reduction is projection-based reduction. The following methods fall into this class:
• The classic Proper Orthogonal Decomposition (POD) with Galerkin projection
• Singular value decomposition (SVD) or high-order singular value decomposition (HOSVD), possibly combined with interpolation
• Reduced Basis Method
• Balanced Truncation
• Approximate Balancing
• Matrix Interpolation
• Transfer Function Interpolation
• Piecewise Tangential Interpolation
• Loewner Framework
• Empirical (Cross Gramian)5
• Krylov Subspace Methods

Among these, the application of POD-Galerkin reduced order modelling to Finite Volume discretizations has gained increasing acceptance in industrial fields.

Common Features Shared by Reduced Order Methods (ROM)
All reduced bases require the solution of high-fidelity, and therefore very expensive, discrete state and/or sensitivity equations and/or adjoint equations. The idea is that these expensive calculations can be done off-line, before a state simulation or the optimization of the design parameters or feedback control is attempted. Moreover, one hopes that a single reduced basis can be used for several state simulations or in several design or control settings6. All reduced-basis sets are global in nature, i.e., the basis functions have global support. Therefore, solving the state or sensitivity or adjoint equations with respect to any of the reduced bases requires the solution of dense linear and nonlinear systems. Thus, unless the dimension of a reduced basis is "small," it cannot be used without some further processing. Unfortunately, in order to obtain meaningful approximations, it is often the case that the use of reduced bases requires a relatively large number of basis functions. However, it is often the case that reduced bases contain "redundant" information, in the sense that the dynamics of the state should be well approximated by a set of functions of much lower dimension. The question then arises: how can one extract a reduced basis of smaller dimension that contains all the essential information of a reduced basis of larger dimension? This is where Proper Orthogonal Decompositions (POD) and Centroidal Voronoi Tessellations (CVT) come in and, in this sense, they are reduced-reduced basis methods. Unfortunately, there is no adequate theoretical foundation for reduced-order methods, even in state simulation settings. However, it is certain that without an inexpensive method for reducing the cost of state computations, it is unlikely that the solution of 3D optimization and control problems involving complex systems, e.g., the Navier-Stokes system, will become routine anytime soon. Thus, it is also certainly true that these methods deserve more study from the computational and theoretical points of view.

5 In control theory, the cross Gramian is a Gramian matrix used to determine how controllable and observable a linear system is.
6 John Burkardt, Qiang Du, Max Gunzburger & Hyung-Chun Lee, "Reduced order modeling of complex systems", NA03 Dundee 2003.

Reduced Basis Methods
All reduced-order methods are reduced basis methods. However, there is a class of methods that use Lagrange bases, Hermite bases, Taylor bases, and snapshot bases (or, more precisely, snapshot sets) that have come to be known as Reduced-Basis Methods.

Lagrange
Lagrange bases consist of state solutions corresponding to several different values of the parameters (Reynolds number, design parameters, etc.). These solutions are obtained by standard (and expensive) techniques such as finite element or finite volume methods. For example, if one has the design parameters {αj}, j = 1,...,J, one obtains approximate state solutions for n sets of parameter values to form the n-dimensional Lagrange reduced basis7.

Hermite
Hermite bases consist of the state variables and the first derivatives of the state variables with respect to parameters (the sensitivities) determined for different values of the parameters. The state and sensitivity approximations are obtained through standard (and expensive) techniques such as finite element or finite volume methods. Thus, again, if one has the design parameters {αj}, j = 1,...,J, one chooses M sets of parameter values and then obtains the corresponding M approximate state solutions and the corresponding MJ sensitivity derivative approximations. The n = M(J + 1) state and sensitivity approximations form the Hermite reduced basis of dimension n.

Taylor
Taylor bases consist of the state and derivatives of the state with respect to parameters (sensitivities and higher-order sensitivities) determined for a fixed set of design parameters. The state and derivative approximations are obtained through standard (and expensive) techniques such as finite element or finite volume methods. The Taylor basis may be somewhat complicated to program due to the complexity of the partial differential equations that determine the higher-order sensitivities. In addition, the number of higher-order derivatives grows very rapidly with the number of design parameters; e.g., if one has 10 design parameters, there are 55 different second-derivative sensitivities. Thus, the dimension of the Taylor reduced basis grows quickly with the number of parameters and the number of derivatives used.

7 John Burkardt, Qiang Du, Max Gunzburger & Hyung-Chun Lee, "Reduced order modeling of complex systems", NA03 Dundee 2003.

Snapshot Sets
The state of a complex system is determined by parameters that appear in the specification of a mathematical model for the system. Of course, the state of a complex system also depends on the independent variables appearing in the model. Snapshot sets consist of state solutions corresponding to several parameter values and/or evaluated at several values of one or more of the independent variables; for example, steady-state solutions corresponding to several sets of design parameters, or a time-dependent state solution for a fixed set of design parameter values evaluated at several time instants during the evolution process, or several state solutions corresponding to different sets of parameter values evaluated at several time instants during the evolution process. Snapshot sets are often determined by solving the full, very large-dimensional discretized system obtained via finite volume or finite element discretization. Experimental data have also been used to determine a snapshot set. Snapshot sets often contain "redundant" information; therefore, snapshot sets must usually be post-processed to remove as much of the redundancy as possible before they can be used for reduced-order modeling. POD and CVT may be viewed as simply different ways to post-process snapshot sets. Since snapshot sets are the underpinning for POD and CVT, we briefly discuss how they are generated in practice. At this time, the generation of snapshot sets is an art and not a science; in fact, it is a rather primitive art. The generation of snapshot sets is an exercise in the design of experiments, e.g., for stationary systems, how does one choose the sets of parameters at which the state (and sensitivities) are to be calculated (using expensive, high-fidelity computations) in order to generate the snapshot set? Clearly, some a priori knowledge about the types of states to be simulated or optimized using the reduced-order model is very useful in this regard. The large body of statistics literature on the design of experiments has not been used in a systematic manner. For time-dependent systems, many (ad hoc) measures have been invoked in the hope that they will lead to good snapshot sets. Time-dependent parameters (e.g., in boundary conditions) are used to generate states that are "rich" in transients, even if the state of interest depends only on time-independent parameters. In order to generate even "richer" dynamics, impulsive forcing is commonly used, e.g., starting the evolution impulsively with different strength impulses and introducing impulses in the middle of a simulation. In the future, a great deal of effort needs to be directed towards developing and justifying methodologies for generating good snapshot sets8. After all, a POD or CVT basis is only as good as the snapshot set used to generate it.
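Once a snapshot set has been generated, extracting a POD-type reduced basis from it is typically done with a singular value decomposition of the snapshot matrix, whose columns are the individual snapshots. The following minimal sketch is illustrative only: the mean subtraction, the random placeholder data, and the function names are assumptions and not part of any specific code referenced here.

```python
import numpy as np

def pod_basis(snapshots, n_modes):
    """Return the first n_modes POD modes, singular values, and cumulative 'energy' content."""
    X = snapshots - snapshots.mean(axis=1, keepdims=True)   # optional: subtract the mean field
    U, s, _ = np.linalg.svd(X, full_matrices=False)
    energy = np.cumsum(s**2) / np.sum(s**2)                 # used to decide how many modes to keep
    return U[:, :n_modes], s[:n_modes], energy

# Placeholder snapshot matrix: n_dof spatial degrees of freedom, n_snap stored solutions.
n_dof, n_snap = 5000, 120
snapshots = np.random.default_rng(1).standard_normal((n_dof, n_snap))

modes, sv, energy = pod_basis(snapshots, n_modes=10)
coeffs = modes.T @ (snapshots - snapshots.mean(axis=1, keepdims=True))   # temporal coefficients a_i(t_n)
```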

Proper Orthogonal Decomposition (POD) Spaces
In order to create a reduced basis space onto which the governing equations are projected, one can find many techniques in the literature, such as the Proper Orthogonal Decomposition (POD), the Proper Generalized Decomposition (PGD), and the Reduced Basis (RB) method with a greedy approach. The POD approach has been selected here. The POD consists of the decomposition of the flow fields into temporal coefficients a_i(t) and orthonormal spatial bases φ_i(x):

$u(x,t) \;\approx\; \sum_{i=1}^{N_s} a_i(t)\,\varphi_i(x)$        (Eq. 2.1)

where φ_i(x) are orthonormal spatial bases that minimize the average of the error between the snapshots and their orthogonal projection onto the bases, and N_s is the number of considered snapshots. The POD space V_POD = span(φ_1, φ_2, ..., φ_Ns) is then constructed by solving the following minimization problem:

8 John Burkardt, Qiang Du, Max Gunzburger & Hyung-Chun Lee, "Reduced order modeling of complex systems", NA03 Dundee 2003.


$V_{POD} \;=\; \arg\min\; \frac{1}{N_s} \sum_{n=1}^{N_s} \Big\| u_n(x) \;-\; \sum_{i=1}^{N_s} \big(u_n(x),\varphi_i(x)\big)_{L^2(\Omega)}\,\varphi_i(x) \Big\|_{L^2(\Omega)}^{2} \quad\text{where}\quad \big(\varphi_i(x),\varphi_j(x)\big)_{L^2(\Omega)} = \delta_{ij}$        (Eq. 2.2)

where u_n is a general snapshot of the velocity field at time t = t_n. The snapshots can be numerical solutions of the Navier–Stokes equations (typically from LES and DNS simulations, or even from the RANS equations), or they can be obtained from experimental results. The POD basis minimizes the difference between the snapshots and the projection of the snapshots onto the spatial modes in the X-norm, given the orthonormality of the modes. If the L2-norm is chosen, the POD basis is optimal with respect to the energy contained in the snapshots. Following the development in9, it can be shown that this problem can be solved by computing a singular value decomposition of the so-called snapshot matrix.

Galerkin Projection into POD Space
In this section the Galerkin projection of the governing equations onto the POD space is highlighted and discussed. The idea here is to consider both the momentum conservation and the continuity equation. In order to be consistent with the full order solver, the same set of equations is considered, namely the momentum conservation and the Poisson equation for pressure.

Case Study - Vortex Shedding around a Circular Cylinder using a POD-Galerkin Method
Vortex shedding around circular cylinders is a well-known and studied phenomenon that appears in many engineering fields. In this work a Reduced Order Model (ROM) of the incompressible flow around a circular cylinder, built by performing a Galerkin projection of the governing equations onto a lower dimensional space, is presented. The reduced basis space is generated using a Proper Orthogonal Decomposition (POD) approach. In particular the focus is on:
• the correct reproduction of the pressure field, which in the case of the vortex shedding phenomenon is of primary importance for the calculation of the drag and lift coefficients;
• for this purpose, the projection of the governing equations (momentum equation and Poisson equation for pressure) is performed onto different reduced basis spaces for velocity and pressure, respectively;
• all the relevant modifications necessary to adapt standard finite element POD-Galerkin methods to a finite volume framework are presented.
The accuracy of the reduced order model is assessed against full order results.

2.4.2.1 Governing Equations
For the moment, we consider the incompressible Navier–Stokes equations without any turbulence treatment as

$\nabla\cdot u = 0 \;, \qquad \frac{\partial u}{\partial t} + (u\cdot\nabla)u - \nu\,\Delta u + \nabla p = 0$        (Eq. 2.3)

where u is the velocity, p is a normalized pressure and ν is the kinematic viscosity. The equations are given in a domain Ω with proper boundary and initial conditions. The Finite Volume method is a discretization method based on a "balance" approach, well suited for the solution of equations based on conservation laws. A local balance, obtained from the discretization of the integral form of the governing equations, is written on each discretization cell; for details, readers should consult10,11. This approach can be interpreted as if the state vector of the variables of interest was expanded as a linear combination of state-vector spatial modes:

$\begin{pmatrix} u(x,t)\\ F(x,t)\\ p(x,t) \end{pmatrix} \approx \begin{pmatrix} u_r(x,t)\\ F_r(x,t)\\ p_r(x,t) \end{pmatrix} = \sum_{i=1}^{N_r} a_i(t) \begin{pmatrix} \varphi_i(x)\\ \psi_i(x)\\ \chi_i(x) \end{pmatrix}$        (Eq. 2.4)

9 Giovanni Stabile, Saddam Hijazi, Andrea Mola, Stefano Lorenzi, Gianluigi Rozza, "Advances in Reduced order modelling for CFD: vortex shedding around a circular cylinder using a POD-Galerkin method", Communications in Applied and Industrial Mathematics, ISSN 2038-0909, 2017.

Replacing the velocity u with u_r and p with p_r in Eq. 2.3, employing the approximated face flux F_r in the convective term, and applying the Galerkin projection, the reduced order model of the momentum equation is obtained by performing an L2 orthogonal projection onto the reduced basis space V_POD spanned by the POD velocity modes, with a procedure similar to that presented in Eq. 2.2.

$\big(\varphi_i \,,\; u_t + (u\cdot\nabla)u - \nu\,\Delta u + \nabla p\big)_{L^2(\Omega)} = 0$        (Eq. 2.5)

With respect to what was presented in Eq. 2.2, here the gradient of pressure is also considered inside the momentum equation. It is assumed that the velocity and pressure modes share the same temporal coefficients. Substituting the POD approximations of u, F and p into Eq. 2.5 and exploiting the orthogonality of the POD modes, one obtains the following dynamical system of Ordinary Differential Equations (ODEs), the POD-Galerkin ROM for Finite Volume discretization (POD-FV-ROM):

$\frac{d a_j(t)}{dt} = \nu \sum_{i=1}^{N_r} B_{ji}\, a_i(t) \;-\; \sum_{k=1}^{N_r}\sum_{i=1}^{N_r} C_{jki}\, a_k(t)\, a_i(t) \;-\; \sum_{i=1}^{N_r} A_{ji}\, a_i(t)$
$B_{ji} = \big(\varphi_j\,,\,\Delta\varphi_i\big)_{L^2}\;, \qquad C_{jki} = \big(\varphi_j\,,\,\nabla\cdot(\psi_k\,,\varphi_i)\big)_{L^2}\;, \qquad A_{ji} = \big(\varphi_j\,,\,\nabla\chi_i\big)_{L^2}$        (Eq. 2.6)
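Once the reduced operators B, C, and A have been assembled from the Galerkin projection, Eq. 2.6 is just a small system of ODEs for the coefficients a_j(t). The sketch below advances such a system with SciPy instead of the Matlab ODE suite used in the reference; the placeholder operators and initial condition are illustrative and not taken from the cited work.

```python
import numpy as np
from scipy.integrate import solve_ivp

def pod_fv_rom_rhs(t, a, nu, B, C, A):
    """Right-hand side of Eq. 2.6: da_j/dt = nu*B_ji a_i - C_jki a_k a_i - A_ji a_i."""
    quadratic = np.einsum('jki,k,i->j', C, a, a)   # convective (quadratic) term
    return nu * (B @ a) - quadratic - A @ a

# Placeholder reduced operators of dimension Nr (in practice these come from the projection step).
Nr, nu = 10, 1e-3
rng = np.random.default_rng(0)
B = -np.eye(Nr)                       # dissipative diffusion operator (illustrative)
C = 0.1 * rng.standard_normal((Nr, Nr, Nr))
A = np.zeros((Nr, Nr))
a0 = 0.01 * rng.standard_normal(Nr)   # initial coefficients, e.g. projection of the first snapshot

sol = solve_ivp(pod_fv_rom_rhs, (0.0, 10.0), a0, args=(nu, B, C, A), dense_output=True)
```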

2.4.2.2 Details of the Full Order Simulation
The convective term is discretized in space making use of Gauss's theorem. The face center values of the variables are obtained from the cell center ones, which are the numerical problem unknowns, with an interpolation scheme consisting of a combination of a linear and an upwind scheme. The diffusive term is discretized in a similar fashion; in this case though, a central differencing interpolation scheme with non-orthogonality correction is preferred. The pressure gradient is also discretized making use of Gauss's theorem. Here, the face center pressure values are obtained from the cell center ones by means of a linear interpolation scheme, in which a limiting operation on the gradient is performed so as to preserve the monotonicity condition and ensure that the extrapolated face value is bounded by the neighboring cell values. As for the time discretization, a backward Euler scheme is used. The overall time extent of the simulation is equal to T = 3645 s, which is sufficiently long to reach a perfectly periodic response of the lift and drag forces. The simulation is run in parallel on 4 Intel® Core™ processors, taking TCPU-HF = 1483 s ≈ 25 min to be completed.

2.4.2.3 Details of the ROM Simulation
The ROM is constructed using the methodologies described in the previous sections. For the generation of the POD spaces, we considered 120 snapshots of the velocity, mass flux and pressure fields. The snapshots are collected in a time window covering approximately 1.5 periods of the vortex shedding phenomenon; more precisely, the last 73 s of the HF simulation are used. The first two modes for the velocity and pressure fields, respectively, are presented in Figures 3 and 4. The ROM simulations are carried out using different values of the POD velocity space dimension Nu = 3, 5, 7, 10. The dimension of the POD pressure and mass flux spaces is set equal to the dimension of the velocity POD space, Np = Nu. The ROM simulation is run in serial, on the same processor used for the HF simulation. In this case, the time advancing of the ROM problem is carried out using the Matlab ODE suite. Reproducing the full 3645 s extent of the high fidelity (HF) simulation requires, using the ROM model with the highest dimension of the POD space, approximately TCPU-ROM = 9.10 s. This corresponds to a speedup SU ≈ 650.

2.4.2.4 Analysis of the Results
Using the settings described in the previous paragraph, four different ROM simulations are run, each featuring a different value of the POD space dimension. The results are compared with those of the High Fidelity (HF) simulation in terms of the history of the lift and drag coefficients. The time window used for the comparison is the same window used for the collection of the snapshots. The lift coefficient comparison is reported in12, while the drag coefficient time histories are presented in Figure 2.2.

10 Stefano Lorenzi, Antonio Cammi, Lelio Luzzi, Gianluigi Rozza, "POD-Galerkin method for finite volume approximation of Navier–Stokes and RANS equations", Comput. Methods Appl. Mech. Engrg. 311 (2016) 151–179.
11 Giovanni Stabile, Saddam Hijazi, Andrea Mola, Stefano Lorenzi, Gianluigi Rozza, "Advances in Reduced order modelling for CFD: vortex shedding around a circular cylinder using a POD-Galerkin method", Communications in Applied and Industrial Mathematics 9 (1), 2017, 1–21.

Figure 2.2 Comparison of the drag coefficient obtained with the High Fidelity (HF) and ROM simulations

12 Giovanni Stabile, Saddam Hijazi, Andrea Mola, Stefano Lorenzi, Gianluigi Rozza, "Advances in Reduced order modelling for CFD: vortex shedding around a circular cylinder using a POD-Galerkin method", Communications in Applied and Industrial Mathematics 9 (1), 2017, 1–21.


In Figure 2.3 the comparison is shown directly on the velocity and pressure fields. In this case, the time step considered is the last one of the simulations, corresponding to T = 3645 s. The left plots in Figure 2.3 refer to the velocity (top) and pressure (bottom) fields computed with the high fidelity simulation. The right plots refer to the velocity (top) and pressure (bottom) fields computed with the ROM, in which the POD space dimension has been set to Nu = 10. The plots show that, at a glance, the HF and ROM solutions cannot be distinguished.

Figure 2.3 Comparison between velocity and pressure fields, High Fidelity (HF) vs. ROM

Addressing Challenges in Reduced-Order Modeling

One of applied mathematics’ great contributions is the foundation it provides for simulating physical phenomena. From the derivation of consistent, stable, and convergent discretization schemes to the development of efficient parallel solvers, mathematical advances have enabled the ubiquitous nature of modeling and simulation in applications ranging from protein-structure prediction to aircraft design. Today, the predictive capability of validated computational models allows simulation to replace physical experimentation in many scenarios, which facilitates the realization of deeper analyses and better designs at lower costs. However, there is a catch: the resolution required to achieve such high fidelity leads to large-scale models whose simulations can consume weeks on a supercomputer. This creates a massive gap between the simulation times of high-fidelity models and the rapid time-to-solution demands of time-critical (e.g., real-time analysis) and many-query (e.g., uncertainty quantification) applications in engineering and science13.

13 Kevin Carlberg, "Addressing Challenges in Reduced-Order Modeling", SIAM News, March 2016.


To bridge this gap, researchers have pursued reduced-order modeling, which integrates techniques from data science, modeling, and simulation as a strategy for reducing the computational cost of such models while preserving high levels of fidelity. First, these methods execute analyses (e.g., simulating the model, solving Lyapunov equations) during an offline 'training' stage; these analyses generate data that can be mined to extract important physical features, such as low-dimensional solution manifolds and interpolation points for approximating nonlinear functions. Next, these techniques reduce the dimensionality and computational complexity of the high-fidelity model by projecting the governing equations onto the low-dimensional manifold and introducing other approximations where necessary. The resulting reduced-order model (ROM) can then be rapidly simulated during an online 'deployed' stage. While significant advances have been made in reduced-order modeling over the past fifteen years, many outstanding challenges face the community, especially with respect to applying model reduction to parameterized nonlinear dynamical systems. To address this, one workshop theme focused on applying ROMs to truly large-scale nonlinear problems in engineering and science. To motivate this, an invited speaker provided a number of compelling examples in which the computational cost incurred by such models poses a major bottleneck to design engineers across the naval, aerospace, and automotive industries. A number of challenges arise in this case. First, ROM techniques must be tightly integrated with the original high-fidelity simulation code, because most nonlinear ROM methods realize computational savings by performing computations with the high-fidelity model on a small subset of the computational domain. Second, ensuring accurate ROM solutions can be challenging due to the complex dynamics (e.g., stiffness) exhibited by many large-scale dynamical systems. Finally, when the model is very large scale, the computational costs of both the offline training and online deployment can remain prohibitive; devising ways to reduce them is often essential. A second major workshop theme focused on applying ROMs to design optimization. These many-query problems, which are often formulated as mathematical optimization problems constrained by partial differential equations, can require hundreds of simulations (and sensitivity analyses) of the computational model. Thus, rapid model evaluations are necessary when faced with time or resource constraints. Louis Durlofsky14 proposed a related method based on the Trajectory Piecewise Linear (TPWL) ROM, and showed promising results on oil-production optimization under water injection. Despite the many challenges, model reduction remains an exciting research area that is making rapid progress toward bridging the gap between high-fidelity models and time-critical applications in engineering and science.

14 He, J., & Durlofsky, L.J., "Reduced-order modeling for compositional simulation by use of trajectory piecewise linearization", SPE Journal, 2014.

Reduced Order CFD Simulation
The unsteady Euler and Navier-Stokes solutions have thousands of degrees of freedom. This means that the costs of unsteady flow studies are prohibitive. Schemes that retain the accuracy of the full non-linear methods, but at a reduced cost, will make such studies feasible. This is the rationale for Reduced Order Models (ROM), which are based on statically non-linear flow solutions but with a dynamically time-linear approach. Thus unsteady flows that are a small perturbation about a steady flow with shocks and separations are modelled. This makes ROMs ideal for applications such as flutter clearance and aero-servo-elasticity. To generate a ROM about a particular non-linear mean solution, the dynamically time-linear response must be extracted from the CFD code. A system identification and reduction scheme is then used to construct the ROM, a state space system, from the pulse responses. This system is of much lower order than the original non-linear CFD scheme, but is able to reproduce its behavior. The ROMs are in state space form and so can easily be coupled to a structural model for aeroelastic and aero-servo-elastic calculations. One advantage of the current approach is that the aerodynamic model is constructed independently of the structural model, and thus a redesigned structure does not require a new ROM. It could be shown that the flutter boundary of a 2D airfoil can be reproduced by a ROM of order 18, where the original CFD is of order 27,000. The use of ROMs enabled each flutter point to be calculated in less than 1/100th of the computing time compared to the full CFD. Over the years, 1D, 2D and 3D CFD software solutions have been used successfully for modeling thermo-fluid systems in the automotive, aerospace, oil, gas, power and energy industries. 1D CFD systems allow analyses of a wide range of complex engineering problems; for example, engineers can rapidly and accurately analyze piping networks of almost any size or complexity to establish design integrity. 2D CFD cross-section simulation is mostly used in airfoil design for aircraft, or for blades in pumps, compressors and turbines of turbo-machinery. However, the 3D CFD phenomena associated with these designs cannot be resolved in 2D simulations. In the real world, all fluid flow problems are 3D in nature and vary with time; however, with thoughtful care, the simulation of many components and systems can be run in fewer dimensions. If done properly, the results from a simplified solution can give just as meaningful insight, but with a fraction of the computational effort. While 1D CFD is best used for system-level analysis to understand how different parts of a system will interact, 3D CFD is used for component-level analysis to understand design trade-offs of detailed part designs, as shown in Figure 2.4. In summary, while 1D CFD is typically much faster than 3D CFD calculations, taking only minutes to perform and providing a relatively quick system overview, 3D CFD is mainly used for the design of individual components, allowing engineers to understand how detailed flow interacts with all manner of complex geometry15. The question arises when to use 1D CFD vs. 3D CFD. While there is not a definitive answer, the strengths and weaknesses of each approach lend themselves to two fairly well-defined arguments. When designing a single component or small subset of components, every inch of length or degree of

Figure 2.4 1D vs 3D Analysis

15 Mentor Graphics Corporation®.


curvature can make a difference. In these cases, when small changes to a single part of a system are crucial, or there are significant flow variations in multiple dimensions, 3D CFD is the obvious choice because of its ability to analyze complex geometry with extreme accuracy. However, these benefits come with drawbacks, which become more evident as the scale of the design increases. When the design reaches beyond the component level, the computational requirements become too high and the simulations take too long to fit within the development schedule. This is when 1D CFD is a good choice: because the 1D approach simplifies the 3D geometry to the component level, usually characterized by some sort of performance data, it uses much less computing power and is usually faster than a comparable 3D model.

Case Study 1 - Designing Parameters of Test Stage Axial Turbine
At present, turbo-machine element design using integrated software is developing intensively. Using 3D simulation of the turbine flow path remains very labor intensive, which considerably hampers its usage; therefore, one-dimensional (1D) and axisymmetric (2D) analyses are still widely used. Gas turbine engine qualitative characteristics are determined by the concepts taken into account in the early phases of engine component design, and turbine multidisciplinary optimization problems are the topic of various research efforts.

After a 1D mean-line calculation, a stage-by-stage 2D (axisymmetric) calculation was performed to determine the twist laws of the blades which provide the highest efficiency. The first stage design (P1) was a prototype of the last stage of the Intermediate Pressure (IP) turbine of a large steam turbine, with a reaction at mean radius such as to provide axial flow exit from the stage, and had a twist following the free-vortex design law. The second stage design (P2) was intended to test the possibility of increasing the load while preserving axial flow exit. The main parameters of the stages put on trial are presented in Table 2.1. In addition to the stage integral characteristics, the axisymmetric computations provide the distribution of the flow parameters in the axial gap along the radius. In this method, the estimation of the loss components along the radius is a subject of importance. The secondary losses were connected at the blade tip by a special algorithm and were calculated for each station along the blade height by fitting a local profile loss magnitude. In summary, the process can be envisioned through Figure 2.5.

Table 2.1 Main Parameters of the Test Stages (P1) and (P2)

  Stage Design                                   P1        P2
  Inlet Pressure, Pa                             117000    130000
  Inlet Temperature, K                           373       373
  Outlet Pressure, Pa                            100000    100000
  Rotation frequency, 1/s                        7311      8212
  Nozzle vane mean diameter, m                   0.2978    0.2978
  Nozzle vane length, m                          0.0822    0.0822
  Blade mean diameter, m                         0.2986    0.2986
  Blade length, m                                0.0854    0.0854
  Nozzle vane outlet gauging angle near hub      20        17.2
  Nozzle vane at mean radius                     24        17.5
  Nozzle vane at peripheral radius               28        17.8
  Blade outlet gauging angle near hub            32        41
  Blade at mean radius                           29.7      26
  Blade at peripheral radius                     26        19

Figure 2.5 Turbine flow design process: 1-D (basic pitch-line / mean-line contour geometry and blade parameters) → 2-D (blade reverse engineering, blade-to-blade and through-flow calculations) → 3-D (multi-stage calculation, 3-D flow phenomena and performance)

Blade Reverse Engineering as applied to Geometry Definition
The airfoil planar shape can be derived from six control points using NURBS (Non-Uniform Rational B-Splines), defined in the preliminary design. The airfoil geometry is generated on planar design

sections, with the sections arranged along the blade height following a selected rule16. A turbine designer may choose an approach to profiling in which the sections are profiled along the direction of the streamlines. Then, the airfoil centroids are placed upon a radial line, and a skeleton generated from the sections is covered with a surface that is a NURBS. In the process of constructing the planar sections, a technique of profile shape optimization on both geometry and aerodynamics was applied. The blades that were used in the test turbine stage are of particular interest from several points of view. First of all, the nozzle vane cascades are assembled from profiles supplied with trailing edge extensions; these are characterized by heightened strength properties at reasonably high efficiency and low sensitivity to inlet flow angle variation. Then, specially profiled cascades with divergent channels in the hub zone, capable of providing hub reaction at moderate loss, were used, which permits increased loading.

3D Aerodynamic Computation
Experience proves that any problem solved in a 3D formulation while bypassing 1D and 2D analyses is fraught with the danger of errors in flow rate and efficiency determination, particularly when the shape is defined with low accuracy. At the same time, unidirectional and axisymmetric components feature high reliability, high speed of operation and accuracy sufficient for conventional turbine design. 3D analysis is a laborious and sophisticated tool, and the modeling time invested is several orders of magnitude larger than for 1D and 2D models. In addition, the designer needs to possess and maintain specialized skills for mesh generation, turbulence model

Figure 2.6 Profile of Blades

16 Moroz, L., Govoruschenko, Y., Pagur, P., "Proceedings of GT2005 ASME Turbo Expo 2005: Power for Land, Sea and Air", June 6-9, 2005, Reno-Tahoe, Nevada, USA.


selection, boundary condition application, etc. Indeed, all the shortcomings of 3D analysis are compensated by its capability to quantitatively capture flow nuances such as secondary effects in the cascade and flow separation, which cannot be precisely detected with the low-fidelity models.

Case Study 2 - Cooling Air Flow Rate
The flow rate of cooling air through the heat exchangers is obviously a key parameter defining the performance of the system. It comes from two sources - the fans and the ram effect generated by the movement of the vehicle through the atmosphere. As far as the ram air is concerned, there is a trade-off between a desired high flow rate for good heat exchanger performance and a low flow rate for minimization of overall vehicle drag; the flow of air through the front end of the vehicle typically adds 5% to the vehicle drag. There are three basic approaches to establishing the front end air flow rate under particular operating conditions. At the most sophisticated level, complete CFD analyses can be performed which model the detail of the air flow around the outside of the complete vehicle and through the vehicle front end and engine compartment, including the various heat exchangers and even through the rotating fans. Figure 2.7 shows the CFD analysis of flow through the vehicle front end with streamlines and pressure contours. This method delivers a great deal of information about the system but is demanding in terms of computing effort17. At the intermediate level of complexity, commercial software exists that allows networks of 1D components to be set up and the air flow distribution through them to be calculated. This can be valuable when problems arise such as air recirculation or significant temperature and/or flow rate distributions.

Figure 2.7 Typical Cooling System Network for Airflow Rate

Reduced Order Model using Empirical Relationship
Despite the sophistication of these development tools, there exists a much simpler tool for the prediction of front end air flow that proves capable of delivering considerable insight into the way the system is behaving. It is based on a 1D model that characterizes the face air flow velocity through the heat exchangers, vR, in terms of a few non-dimensional constants:

$v_R = \left(\frac{F\,v_0^2 + \psi_0\, u_0^2}{1 + \zeta_R + \zeta_{sys} + \zeta_F}\right)^{1/2}$        (Eq. 2.7)

where:

  F       measure of the effectiveness of the front end shape in delivering
  V0      vehicle velocity
  ψo      maximum non-dimensional pressure coefficient of the fan
  u0      fan tip speed
  ζR      pressure drop coefficient of the heat exchangers
  ζsys    pressure drop coefficient of the remainder of the system (grill, engine compartment, etc.)
  ζF      pressure drop coefficient for the fan itself

17 Mentor Graphics Corporation, 2012.

The appropriate values for the unknown system and fan constants (F, ψo, ζSys, ζF) are determined from wind tunnel measurements of air flow rates through the heat exchangers for ranges of different vehicle speeds and fan speeds. The pressure drop coefficients of the heat exchangers as functions of air flow rate are already known. The values of the system constants are extracted from the experimental dataset using non-linear optimization techniques. Knowledge of the values of these parameters for a system allows the air flow rate through the heat exchangers to be explored for any vehicle speed or fan speed and even allows the effects of different heat exchangers to be evaluated18.
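A minimal sketch of how such a model can be used in practice is given below: Eq. 2.7 is evaluated directly for the face velocity, and the unknown constants (F, ψ0, ζsys, ζF) are recovered from measured face velocities by non-linear least squares. The numerical values, the assumption of a constant heat-exchanger coefficient ζR, and the function names are purely illustrative.

```python
import numpy as np
from scipy.optimize import least_squares

def face_velocity(v0, u0, zeta_R, F, psi0, zeta_sys, zeta_F):
    """Eq. 2.7: heat-exchanger face velocity from vehicle speed v0 and fan tip speed u0."""
    return np.sqrt((F * v0**2 + psi0 * u0**2) / (1.0 + zeta_R + zeta_sys + zeta_F))

def residuals(x, v0, u0, zeta_R, vR_measured):
    F, psi0, zeta_sys, zeta_F = x
    return face_velocity(v0, u0, zeta_R, F, psi0, zeta_sys, zeta_F) - vR_measured

# Hypothetical wind-tunnel data: vehicle speeds, fan tip speeds and measured face velocities.
v0 = np.array([10.0, 20.0, 30.0, 30.0])
u0 = np.array([40.0, 40.0, 40.0, 0.0])
vR_measured = np.array([2.6, 3.9, 5.4, 4.9])
zeta_R = 8.0                                    # treated here as a known constant for simplicity

fit = least_squares(residuals, x0=[0.5, 0.1, 5.0, 1.0],
                    args=(v0, u0, zeta_R, vR_measured))
F, psi0, zeta_sys, zeta_F = fit.x               # extracted system and fan constants
```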

18 See previous.


3 Computational Error and Uncertainty Quantification
CFD is still a tool which requires that a user has a good understanding of the uncertainties and errors that might spoil a CFD simulation. There exists no error control in CFD, and any CFD simulation must be interpreted by an experienced user to have some credibility. Without some knowledge about possible errors and how they can be handled, a CFD simulation cannot be trusted. Errors can occur at different places:
• Definition of the problem: what needs to be analyzed;
• Selection of the solution strategy: what physical models and what numerical tools should be used;
• Development of the computational model: how should the geometry and the numerical tools be set up;
• Analysis and interpretation of the results: how should the model be analyzed and the results be interpreted.

There exist many different definitions of errors. In this guide the errors are classified into four types:
• Problem definition errors,
• Model errors,
• Numerical errors, and
• User and code errors.
The sections below describe these errors and give some guidelines on how to avoid them. While Uncertainty Quantification (UQ) and Error are commonly used interchangeably in everyday language, some basic definitions are warranted here. We follow the definitions of the AIAA Guidelines, which define Uncertainty as "A potential deficiency in any phase or activity of the modeling process that is due to the lack of knowledge," and Error as "A recognizable deficiency in any phase or activity of modeling and simulation that is not due to lack of knowledge" (AIAA G-077-1998). The key phrase differentiating the definitions of uncertainty and error is lack of knowledge. The key word in the definition of uncertainty is potential, which indicates that deficiencies may or may not exist. Lack of knowledge has primarily to do with lack of knowledge about the physical processes that go into building the model. Sensitivity and uncertainty analyses can be used to better determine uncertainty. Uncertainty applies to describing deficiencies in turbulence modeling, for example; there is a lot about turbulence modeling that is not understood. One approach for determining the level of uncertainty and its effect on one's analysis is to run a number of simulations with a variety of turbulence models and see how the modeling affects the results19. The definition of error implies that the deficiency is identifiable upon examination. One can differentiate between local and global errors. Local errors refer to errors at a grid point or cell, whereas global errors refer to errors over the entire flow domain. We are interested here in the global error of the solution, which accounts for the local error at each grid point but is more than just the sum of the local errors. Local errors are transported, advected, and diffused throughout the grid. The definition of error presented here is

19 Responsible NASA Official/Curator: John W. Slater.


different than that which an experimentalist may use, which is "the difference between the measured value and the exact value". Experimentalists usually define uncertainty as "the estimate of error". These definitions are inadequate for computational simulations because the exact value is typically not known. Further, these definitions link error with uncertainty. The definitions provided in the above paragraphs are more definite because they differentiate error and uncertainty according to what is known.

Classification of Errors
Here we provide a classification or taxonomy of errors, namely Acknowledged Errors and Unacknowledged Errors. According to NASA, acknowledged errors (examples include round-off error and discretization error) have procedures for identifying them and possibly removing them; otherwise they can remain in the code with their error estimated and listed. Unacknowledged errors, on the other hand (examples include computer programming errors or usage errors), have no set procedures for finding them and may continue within the code or simulation.
1 - Acknowledged Errors
  • Modeling Errors
    - Physical Modeling Errors
    - Geometry Modeling Errors
  • Discretization Errors
    - Spatial Discretization Errors (Discretization of Governing Equations)
    - Domain Discretization Errors (Grid Generation): Grid Density (Grid Independence Study), Topology, Grid Sensitivity
    - Error Estimation: Higher-Order Accurate (Richardson's Extrapolation), Residual Based (Truncation Method)
    - Temporal Discretization Errors
  • Iterative Convergence Errors
  • Computer Round-off Errors
  • Truncation Errors
2 - Unacknowledged Errors
  • Code Errors
  • Usage Errors
Each of these types of errors will be discussed below.

Physical Modeling Error
Physical modeling errors are those due to uncertainty in the formulation of the model and deliberate simplifications of the model. These errors deal with the continuum model only; converting the model to discrete form for the code is discussed as part of discretization errors. Errors in the modeling of the fluids or solids problem are concerned with the choice of the governing equations which are solved and models for the fluid or solid properties. Further, the issue of providing a well-posed

problem can contribute to modeling errors. Often modeling is required for turbulence quantities, transition, and boundary conditions. Mehta lists sources of uncertainty in physical models as:
• the phenomenon is not thoroughly understood;
• parameters used in the model are known but with some degree of uncertainty;
• appropriate models are simplified, thus introducing uncertainty;
• experimental confirmation of the models is not possible or is incomplete.

Even when a physical process is known to a high level of accuracy, a simplified model may be used within the CFD code for the convenience of a more efficient computation. Physical modeling errors are examined by performing validation studies that focus on certain models (i.e. inviscid flow, turbulent boundary layers, real-gas flows, etc...). It is essential to have an overview of the physics involved and how the problem can best be analyzed. Running a 2D simulation in order to understand secondary flows or running a steady simulation in order to understand transient behavior is of course no use. When assessing a CFD simulation the first thing to consider is what physical phenomena are important for the results and if the selected type of simulation is suitable to resolve this type of phenomena. For further information about selecting the most suitable type of simulation please see the previous chapter on deciding what type of simulation to perform. Once the type of simulation has been selected the next step is to select what type of physical models the simulation should use. The following points should be considered:

• Gas data (incompressible/compressible, perfect gas/real gas, ...)
• Turbulence modeling (type of model, type of near-wall treatment, ...)
• Other models (combustion, sprays, ...)

Figure 3.1 Pressure Coefficient at 20% Chord Length using Different Turbulence Models (κ-ε, LES, κ-ω SST, Reynolds Stress)

When assessing model-related errors it is important to know the features of the selected model and think carefully how these features and possible shortcomings might affect the predicted physical behavior. Using the wrong turbulence model or combustion model can completely destroy the results of a CFD simulation. Figure 3.1 depicts the effects of different turbulence models on a turbine blade vortex at 20% chord length. Another example would be the effect of different turbulence models on a steep obstacle, as shown in Figure 3.2. As is evident, in both cases different models result in slightly different solutions, and are therefore a source of uncertainty and error.

Figure 3.2 Effects of Different Turbulence Models on a Steep Obstacle

Geometrical Modeling Errors
It is almost always necessary to simplify the geometry in some form. When assessing a CFD simulation one should consider how the geometrical simplifications can affect the interesting physical phenomena. Typical geometrical errors are:
• Simplifications: small geometrical features like fillets, small steps or gaps etc. can often be disregarded. When disregarding this type of feature one should consider whether it might affect the important physics. For example, a very large fillet on the suction side of a vane might affect corner separations near the end-walls, and a large tip leakage might affect the flow physics significantly in the upper part of a compressor.
• Tolerances and manufacturing discrepancies: if the geometry has very large tolerances or is manufactured in a way which might produce a non-ideal shape or position, it might be necessary to perform additional CFD simulations in order to cover the whole span of possible real geometries.
• Surface conditions: roughness, welds, steps, gaps etc. Often CFD simulations assume a perfectly smooth surface. A non-smooth surface which might have welds, steps or even gaps will of course produce different results.


If the physical phenomena of interest might depend on the surface conditions, these should of course be considered. Typical phenomena that might be dependent on this type of error are transition prediction, leakage flows, etc.

Spatial Discretization (Governing Equations) Errors
In general, discretization error is defined as the difference between the numerical solution to the discretized equations and the exact solution to the partial differential (or integral) equations. Spatial discretization of the governing equations produces the discrete equations over the domain discretization. There are currently four methods of discretization available:
• Taylor series expansion
• Polynomial fitting
• Integral method
• Control volume approach

Each of these has its own characteristics, and it is sometimes possible to obtain exactly the same formulation using all four methods, especially for simple linear cases. Two different ideas should be discussed. The first has to do with the PDEs themselves, in regard to terms like "conservation form", "conservation law form", or "divergence form". The difference between conservative and non-conservative representations of an equation has been discussed before. Normally, for a PDE which represents a physical conservation statement, this means the divergence of a physical quantity can be identified in the equation. The second idea is that of the conservative property of the finite difference representation. Such a PDE represents a conservation statement at a point. We strive to construct a finite difference representation which provides a good approximation to the PDE in a small, local neighborhood involving a few grid points. The same conservation principle which gave rise to the PDEs also applies to arbitrarily large regions (control volumes) through the use of the Divergence theorem (Eq. 3.1), which converts a volume integral into a surface integral.

Figure 3.3 Inviscid stencil with 1st order cells in red and 2nd order cells in green

$\iiint_V \big(\nabla\cdot\vec{F}\big)\, dV = \oiint_S \big(\vec{F}\cdot\vec{n}\big)\, dS$        (Eq. 3.1)

Those finite difference schemes which maintain the discretized version of the conservation statement are said to have the conservation property; for the majority of problems it is crucial. Differences between the finite difference and finite volume methods are subtle, but can be generalized as:
Finite difference
• Approximates the governing equation at a point.
• Finite difference methods were developed earlier, and the analysis of these methods is easier and further developed.
Finite volume
• Approximates the governing equation over a volume.

Figure 3.4 Viscous stencil with viscous cells in blue and 2nd order cells in green

36



Finite volume is the most physical in fluid mechanics codes, and is actually used in most codes.

To that end, several discretization techniques were tabulated (Tables 1 & 2) previously, which serve as an excellent guide to spatial discretization techniques, both explicit and implicit. Although high order schemes seem to perform better, their high CPU cost often prevents them from being implemented. On balance, based on the recommendations from these tables, only 2nd order or higher schemes should be considered, with their stability kept in mind. To illustrate this, second-order spatial accuracy for the inviscid fluxes is achieved by using MUSCL extrapolation to reconstruct an approximate value of the primitive variables on each side of each cell face. The MUSCL scheme for the inviscid fluxes uses a 13-point stencil per cell in three dimensions, as shown in Figure 3.3, with 1st order cells in red and 2nd order cells in green. The viscous flux is calculated using a Green's theorem approach to evaluate the derivatives at cell faces, and central differencing is used to calculate the scalar values. The viscous fluxes require a further twelve points to be added to the inviscid stencil, for a total of 25 cells in the stencil per cell in three dimensions, as shown in Figure 3.4, with viscous cells in blue and 2nd order cells in green.

Figure 3.5 Effect of 1st and 2nd Order Differencing Scheme in Error

Higher Order Discretization
There is a big debate in the CFD community whether to use 1st, 2nd, or higher order discretization. There are of course differences between 1st and 2nd order, as depicted in Figure 3.5: the 2nd order scheme (i.e., central differencing) reaches the desired error much faster than the 1st order scheme, but the picture is murkier for higher order differences. Some flows are diffusion dominated and others convection dominated; consider, for example, supersonic flows (high Reynolds number, convective) versus subsonic (diffusive) ones. In that case, the rule of thumb is to use the Peclet number (Pe = Re·Pr), as shown in Figure 3.6. When discretizing the Navier-Stokes equations, this notion applies to the discretization of the convective term. While the centered scheme is 2nd order accurate and the upwind scheme is only first order, the upwind scheme is more diffusive than the centered scheme but also more stable; the centered scheme can lead to spurious oscillations (dispersive errors). It is well known that when the Peclet number is > 2 it is preferable to switch from the centered scheme to the upwind scheme. Due to this possible oscillating behavior of the centered scheme, especially in convection-dominated flows (high Reynolds number), the use of the upwind scheme can stabilize the solution. In conclusion, the QUICK and third-order MUSCL discretization schemes may provide better accuracy than the 2nd order scheme. The QUICK scheme is applicable to quadrilateral or hexahedral meshes, while the MUSCL scheme can be used on all mesh types. In general, however, the second-order scheme is sufficient and the QUICK scheme will not provide significant improvements in accuracy. There are other, fancier higher order schemes which have been designed for, and work in, special situations. For the time being, 2nd or 3rd order accuracy is deemed sufficient for most applications.

Figure 3.6 Effect of Pe Number in balancing Diffusive and Convective Flows
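As a hedged illustration of the Pe > 2 rule of thumb above, the sketch below is a hypothetical 1D convection-diffusion face interpolation (not taken from any particular code) that blends central and upwind differencing based on the cell Peclet number, in the spirit of classic hybrid schemes.

```python
def face_value(phi_L, phi_R, u_face, rho, gamma, dx):
    """Convected value at a face for a 1D convection-diffusion problem.

    Uses 2nd order central averaging when the cell Peclet number is modest,
    and falls back to 1st order upwinding when |Pe| > 2, where the central
    scheme is prone to dispersive (checker-board like) oscillations.
    """
    pe = rho * u_face * dx / gamma            # cell Peclet number
    if abs(pe) <= 2.0:
        return 0.5 * (phi_L + phi_R)          # central differencing, 2nd order
    return phi_L if u_face > 0.0 else phi_R   # upwind, 1st order but bounded

# example: strongly convective face (Pe = 20), so the upwind value is returned
print(face_value(1.0, 0.0, u_face=2.0, rho=1.0, gamma=0.01, dx=0.1))
```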

Discretization Errors
Mesh Density
To minimize the effort, users are advised to consult the guidelines developed by ITTC, ASME, or the Journal of Fluids Engineering before starting a numerical simulation. Discretization errors are due to the difference between the exact solution and the numerical representation of the solution, and can be either spatial errors (in space) or temporal errors (in time). Spatial discretization errors are what people normally call discretization errors; their effects are the most pronounced and have rightfully been investigated in depth. They can be quantified by a sequence of systematic mesh refinement/coarsening, as depicted later on, from which the errors and uncertainties, as well as the sensitivity of the solution with respect to mesh size, can be estimated. To minimize the uncertainties and errors within a numerical simulation, sets of parametric studies and comparisons are developed; these are simple yet time consuming and tedious, especially for real applications. Describing exactly what discretization different codes use and what errors this might lead to is not possible here. Instead, some general rules to avoid these errors can be summarized as:

 Use at least a 2nd order accurate scheme, preferably a 3rd order accurate scheme. Some general purpose codes have a 1st order upwind scheme as default; this is a very diffusive scheme that often gives overly smooth results.
 For new applications, always run a simulation with a finer mesh to see how grid independent your solution is.
 Be aware of checker-board errors. Checker-board errors occur close to strong shocks and other large discontinuities and can be seen as a wavy pattern with a wavelength of two cells. Some schemes, especially those which behave like central differencing schemes, are more prone to checker-board errors. Upwind schemes are a bit better, and schemes like TVD or shock-capturing schemes are better still (see Figure 3.7).

Grid Independence Study
Performing a grid independence study is fairly straightforward, as seen in the example of compressible flow over a forward-facing step (Figure 3.7); a minimal scripted version of the check is sketched after the steps below:

1. Run the initial simulation on your initial mesh and ensure convergence of the residual error to 10^-4, that monitor points are steady, and that imbalances are below 1%. If not, refine the mesh and repeat.
2. Once you have met the convergence criteria above for your first simulation, refine the mesh globally so that you have finer cells throughout the domain; generally, aim for around 1.5 times the initial mesh size. Run the simulation and ensure that the residual error drops below 10^-4, that the monitor points are steady, and that the imbalances are below 1%. At this point, compare the monitor point values from Step 2 against the values from Step 1. If they are the same (within your own allowable tolerance), then the mesh at Step 1 was accurate enough to capture the result. If the value at Step 2 is not within acceptable tolerance of the Step 1 result, then your solution is changing because of the mesh resolution, and hence the solution is not yet independent of the mesh. In this case you will need to move to Step 3.
3. Because your solution is changing with the refinement of the mesh, you have not yet achieved a mesh independent solution. Refine the mesh further and repeat the process until you have a solution that is independent of the mesh. You should then always use the smallest mesh that gives you this mesh independent solution (to reduce your simulation run time).
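A minimal sketch of the comparison in Steps 2 and 3, assuming hypothetical monitor-point values recorded on successively refined meshes (the values and tolerance below are illustrative, not from any particular solver):

```python
# Hypothetical monitor-point values (e.g., pressure drop) from successively
# refined meshes: index 0 = initial mesh, 1 = ~1.5x refined, 2 = refined again.
monitor_history = [1523.0, 1496.0, 1494.0]   # illustrative numbers only
rel_tol = 0.005                              # user-chosen allowable tolerance (0.5%)

def mesh_independent(values, tol):
    """Return the index of the coarsest mesh whose monitor value agrees with
    the next refinement to within tol, or None if refinement must continue."""
    for i in range(len(values) - 1):
        change = abs(values[i + 1] - values[i]) / abs(values[i + 1])
        if change <= tol:
            return i          # mesh i already captured the result
    return None               # keep refining (Step 3)

idx = mesh_independent(monitor_history, rel_tol)
print("use mesh level", idx) if idx is not None else print("refine further")
```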

Figure 3.7 Effects of Mesh Density on Solution Domain

Grid Topology
Before we pay attention to the individual cell topology, we consider the domain topology; for the 2D case, the H, C, and O topologies are compared. Meshes with H-H and C-H topology were constructed for the 3D comparison; however, due to the incompatibility of the C-H structure on a sharp wing tip or trailing edge with the current solver, no C-H studies are included. Most of the studies were under lifting inviscid flow conditions; multiple studies were conducted under turbulent conditions but only one is included. Overall, when it comes to topology, the H mesh scores first place, followed by the C mesh, with the O mesh last. When it comes to mesh parameters, the studies show that with carefully chosen mesh spacing around the leading edge, good orthogonality and skewness factors, smooth spacing variation, and a reasonable number of nodes, excellent CFD results can be obtained from the mesh in terms of accuracy of the computed functional, determined convergence order, and adjoint error estimation. With regard to the topology of individual cells, three types are considered: hexahedral, tetrahedral, and polyhedral. The solution on the polyhedral mesh produced the lowest absolute residual value, and the number of iterations for each mesh type to reach the same level of convergence (10^-4) for the pressure residual is shown. While there are minor differences in the converged pressure drop, the simulations are in broad agreement on the overall value. The number of iterations for each mesh type to reach a steady state value for the pressure drop is shown accordingly. The volumetric cell count for polyhedral cells is lower than for the other two, thereby saving valuable time and effort in computation (see Figure 3.8).

Figure 3.8 Domain Topology (O-Type, C-Type, and H-Type; from left to right)

Sources of Discretization Error
Discretization error occurs during the approximate numerical solution of differential equations. Evaluation of discretization error requires knowing the exact solution of the governing equations, which is generally not known for problems of practical interest. In such scenarios, a mathematically rigorous technique called the Method of Manufactured Solutions (MMS) can be used, where a solution is manufactured and used as the exact solution. MMS is based upon the philosophy that code verification deals with the mathematics of the problem, and hence arbitrary functions (with certain requirements, as discussed later) can be selected as exact solutions20. Of the various sources of numerical error, discretization error is generally the largest and usually the most difficult to estimate. The goal here is to review the different approaches for estimating discretization error and to present a general framework for their classification. The first category of discretization error estimator is based on estimates of the exact solution to the differential equation which are higher-order accurate than the underlying numerical solution, and includes approaches21 such as:
 Residual (i.e., the truncation error)
 Gradient and Flux Calculation (different geometries)
Residual methods include discretization error transport equations, finite element residual methods, and adjoint method extensions. The discretization error has two components: one that is locally generated and one that is transported from elsewhere in the domain. The transported component is called pollution error by the finite element community and can be related to the convergence of the numerical method through the truncation error. The truncation error is the difference between the discrete equations and the mathematical model equations. Thus, the discretization error is transported in the same manner as the underlying solution properties (e.g., it can be convected and diffused), and is locally generated according to the truncation error.

20 Aniruddha Choudhary, “Verification of Compressible and Incompressible Computational Fluid Dynamics Codes and Residual-based Mesh Adaptation”, Dissertation, Virginia Polytechnic Institute and State University, 2014.
21 Christopher J. Roy, “Review of Discretization Error Estimators in Scientific Computing”, 48th AIAA Aerospace Sciences Meeting Including the New Horizons Forum and Aerospace Exposition, January 2010, Orlando, FL.


Case Study – Hypersonic Flow over an Axisymmetric Sphere-Cone
An example of error transport for the Euler equations is shown in Figure 3.9, which gives the error in the density for the inviscid, Mach 8 flow over an axisymmetric sphere-cone (Roy)22. The flow is from left to right, and large discretization errors are generated at the bow shock wave where the shock and the grid lines are misaligned. In the subsonic (i.e., elliptic) region of the flow immediately behind the normal shock, these errors are convected along the local streamlines. In the supersonic (hyperbolic) regions, these errors propagate along characteristic Mach lines and reflect off the surface. Additional error is generated at the sphere-cone tangency point, which represents a singularity due to the discontinuity in the surface curvature. Errors from this region also propagate downstream along the characteristic Mach line. An adaptation process driven by global error levels would adapt to the characteristic line emanating from the sphere-cone tangency point, which is not desired. An adaptation process driven by the local contribution to the error should adapt to the sphere-cone tangency point itself, thus obviating the need to adapt to the characteristic line that emanates from it.

Figure 3.9 Contours of Total Estimated Discretization Error in Density

Estimating Discretization Error
There are a number of approaches available for estimating discretization error. These methods can be broadly categorized as a priori methods and a posteriori methods. The a priori methods are those that allow a bound to be placed on the discretization error before any numerical solution is even computed.

22 Roy, C. J. (2003), “Grid Convergence Error Analysis for Mixed-Order Numerical Schemes”, AIAA Journal, Vol. 41, No. 4, pp. 595-604.


One approach to developing an a priori discretization error estimator is to perform a truncation error analysis for the scheme, relate the truncation error to the discretization error (e.g., through a discretization error transport equation), and then develop some approximate bounds on the solution derivatives. The main failing of a priori error estimators is that the resulting error estimate greatly over-estimates the true discretization error; a priori methods are generally only useful for assessing the formal order of accuracy of a discretization scheme. A posteriori methods provide an error estimate only after the numerical solution has been computed. They use the computed solution to the discrete equations, possibly with additional information supplied by the equations, to estimate the error relative to the exact solution of the mathematical model. The initial developments up to the early 1990s were mainly concentrated on linear, elliptic, scalar mathematical models. Up to this point, a posteriori error estimation was limited to analysis of the energy norm of the discretization error, which for Poisson’s equation can be written on element k as:

\|\varepsilon\|_k = \left[\,\int_{V_k} \left|\nabla \vec{u}_h - \nabla \vec{u}_{exact}\right|^2 dV\right]^{1/2} \qquad \text{Eq. 3.2}

where ε is the discretization error, h is a measure of the element size (e.g., Δx), p is the formal order of accuracy of the method, u_h represents the solution to the discrete equations on a mesh with a representative cell length of h, and u_exact is the exact solution to the mathematical model. In general, the level of maturity for a posteriori error estimation methods is strongly problem dependent. All of the discretization error estimators to be discussed here were originally developed for elliptic problems. As a result, they tend to work well for elliptic problems, but are not as well-developed for mathematical models that are parabolic or hyperbolic in nature. The level of complexity of the problem is also an important issue. The error estimators work well for smooth, linear problems with simple physics and geometries; however, strong nonlinearities, discontinuities, singularities, and physical and geometric complexity can significantly reduce the reliability and applicability of a posteriori discretization error estimation methods.

Similarly, in an investigation performed by [Yan and Ollivier-Gooch]23, an Error Transport Equation (ETE) was developed to estimate the discretization error. The ETE is an auxiliary partial differential equation (PDE) derived from the primal one. They compared the accuracy of the resulting discretization error estimate from the linearized ETE and the nonlinear ETE to solving the higher order primal problem. It was shown that for a finite-volume discretization of the 2D viscous Burgers’ equation on an unstructured mesh, the estimated error can be compared visually with the exact error (Figure 3.10). For a structured grid (Table 3.1), the same procedure applies and 2nd order accuracy was obtained for the different grid densities24.

Mesh spacing | L2 norm | Slope of segment
0.75 | 0.013831341 | N/A
0.375 | 0.00350755 | 1.97940
0.1875 | 0.00095693 | 1.87398
0.09375 | 0.00023901 | 2.00129
0.046875 | 0.00006023 | 1.98851
0.0234375 | 0.00001505 | 2.00001
0.0117188 | 0.00000375 | 2.00000

Table 3.1 Discretization Error for 2D Burgers’ Equation

23 Gary Kai Kin Yan, Carl Ollivier-Gooch, “Discretization Error Estimation by the Error Transport Equation on Unstructured Meshes Applications to Viscous Flows”, 54th AIAA Aerospace Sciences Meeting.
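The “Slope of segment” column in Table 3.1 is simply the observed order of accuracy between successive meshes; a short sketch of that calculation, using the tabulated values with the mesh spacing halved at each level, is given below.

```python
import math

# Mesh spacings and L2 error norms from Table 3.1 (2D Burgers' equation)
h  = [0.75, 0.375, 0.1875, 0.09375, 0.046875, 0.0234375, 0.0117188]
e2 = [0.013831341, 0.00350755, 0.00095693, 0.00023901,
      0.00006023, 0.00001505, 0.00000375]

# observed order between levels i-1 and i: p = log(e_coarse/e_fine)/log(r)
for i in range(1, len(h)):
    r = h[i - 1] / h[i]                      # refinement ratio (2 here)
    p = math.log(e2[i - 1] / e2[i]) / math.log(r)
    print(f"h = {h[i]:.7f}  observed order = {p:.5f}")
```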


Other studies suggest improving finite-volume diffusive fluxes through better reconstruction [Sejekan and Ollivier-Gooch]25, where the inaccuracy is believed to originate as an error in the flux integral; the aim is to compute the gradient and flux more accurately at the cell boundaries and hence obtain a better flux integral for a slight increase in computational cost.
3.5.6.1 Case Study – Domain Discretization Error for the Transitional Flow over a Sharp Cone
An example of using the Richardson extrapolation procedure as an error estimator was presented by [Roy and Blottner]26. They examined the hypersonic, transitional flow over a sharp cone. The system response quantity was the heat flux distribution along the surface. The surface heat flux is shown versus the axial coordinate in Figure 3.11-(a) for three systematically-refined mesh levels: fine (160×160 cells), medium (80×80 cells), and coarse (40×40 cells). Also shown are Richardson extrapolation results found from the fine and medium mesh solutions. The sharp rise in heat flux at x = 0.5 m is due to the specification of the location for transition from laminar to turbulent flow. In Figure 3.11-(b), the Richardson extrapolation results are used to estimate the discretization error in each of the numerical solutions. Neglecting the immediate vicinity of the transition location, the maximum estimated discretization errors are approximately 8%, 2%, and 0.5% for the coarse, medium, and fine meshes, respectively. The solutions thus appear to be converging as h → 0. Furthermore, these estimated errors display the expected h^p reduction for these formally second-order accurate computations. In the turbulent region, the maximum errors are also converging at the expected rate, giving error estimates of approximately 4%, 1% and 0.25%.
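A minimal sketch of the Richardson extrapolation estimate used above, assuming illustrative heat-flux values at one surface location on the coarse, medium, and fine meshes (the numbers are hypothetical, not taken from [Roy and Blottner]):

```python
import math

# Hypothetical heat flux at one axial station on three systematically refined
# meshes (coarse -> fine), with a constant grid refinement ratio r = 2.
q_coarse, q_medium, q_fine = 10.8, 10.2, 10.05
r = 2.0

# observed order of accuracy from the three solutions
p = math.log((q_coarse - q_medium) / (q_medium - q_fine)) / math.log(r)

# Richardson extrapolation to h -> 0 using the two finest solutions
q_exact_est = q_fine + (q_fine - q_medium) / (r**p - 1.0)

# estimated relative discretization error on each mesh
for name, q in [("coarse", q_coarse), ("medium", q_medium), ("fine", q_fine)]:
    err = abs(q - q_exact_est) / abs(q_exact_est)
    print(f"{name}: estimated error = {100 * err:.2f}%")
```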

Figure 3.10 Exact error (a) and estimated error (b) for the viscous Burgers’ equation scheme (Courtesy of Yan and Ollivier-Gooch)

Temporal Discretization Errors
Temporal discretization errors mainly affect transient simulations. However, some codes use a time-marching method also for steady simulations, and then a temporal discretization error might affect the final steady solution slightly.

24 I. Sadrehaghighi, “Verification & Validation”, Presented at CDI Marine, March 2011.
25 Chandan B. Sejekan, Carl F. Ollivier-Gooch, “Improving Finite-Volume Diffusive Fluxes Through Better Reconstruction”, Computers & Fluids, August 2016.
26 Roy, C. J. and Blottner, F. G. (2003), “Methodology for Turbulence Model Validation: Application to Hypersonic Transitional Flows,” Journal of Spacecraft and Rockets, Vol. 40, No. 3, pp. 313-324.


The discretization in time can be done with 1st or 2nd order schemes or a Runge-Kutta method, which is more accurate and saves memory. Some codes can adapt the time-step, but often it is necessary to prescribe a time-step in advance. Think of the time-step as your grid in time and make sure that the grid resolution in time is fine enough to resolve the highest frequencies. It is obvious that the CFL number has a bearing on temporal accuracy. In analogy with the spatial accuracy arguments, it is easy to see that a very small physical time step will give very time-accurate results, though it will take more computing time.

Figure 3.11 Surface heat flux (a) and relative discretization error (b) for the transitional flow over a sharp cone


Too high a time step not only causes a loss in temporal accuracy but can also affect stability. A compromise is therefore needed from a practical engineering standpoint. To avoid problems with temporal discretization errors, the following should be considered:

 Try to use at least the same order in your temporal discretization as in your spatial one. For example, if you use Crank-Nicolson, which is 2nd order accurate in time, pair it with a 2nd order spatial scheme; for Euler implicit, which is 1st order, a 1st order accurate scheme, and so on. Accordingly, a smaller time step for the Euler implicit scheme is not much help. An easier way is to look at "temporal refinement" in the same perspective as "spatial refinement" studies: on the same grid, Crank-Nicolson has a slope of 2 as dt is reduced, while Euler implicit has a slope of 1. This means that to achieve the same error levels as Crank-Nicolson, dt must be lowered substantially for Euler implicit.
 Do a physical estimation of the typical frequencies in time of the phenomena that you are interested in and select a time-step which is fine enough to resolve these frequencies well. After the simulation, also look at the frequencies captured and make sure that they are well resolved by the chosen time-step.
 For new applications, try a finer time-step to ensure that your solution is fairly grid independent in time.

Figure 3.12 Temporal Discretization Criteria

This dictates a balancing act between the spatial and temporal discretization, as discussed earlier, and relates directly to the CFL number. It is customary to choose a CFL number of 1 or less to be consistent, as depicted in Figure 3.13. For example, the 1D wave equation and its explicit upwind update can be written as:

\frac{\partial u}{\partial t} + \alpha \frac{\partial u}{\partial x} = 0 \;\;\Rightarrow\;\; u_j^{n+1} = u_j^{n} - \underbrace{\frac{\alpha \Delta t}{\Delta x}}_{CFL}\left(u_j^{n} - u_{j-1}^{n}\right)

The effect of different CFL numbers is displayed in Figure 3.13.
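A minimal sketch of the update above, assuming a periodic domain and illustrative parameters, shows how the CFL number enters the explicit upwind scheme (values above 1 make this scheme unstable):

```python
import numpy as np

def upwind_advection(u0, alpha, dx, cfl, n_steps):
    """March the 1D wave (linear advection) equation with 1st order upwind.

    u_j^{n+1} = u_j^n - CFL * (u_j^n - u_{j-1}^n),  CFL = alpha*dt/dx
    Periodic boundaries are assumed for simplicity.
    """
    dt = cfl * dx / alpha
    u = u0.copy()
    for _ in range(n_steps):
        u = u - cfl * (u - np.roll(u, 1))   # np.roll supplies u_{j-1} periodically
    return u, dt

x = np.linspace(0.0, 1.0, 101)
u0 = np.exp(-200 * (x - 0.25) ** 2)          # initial Gaussian pulse

for cfl in (0.5, 1.0, 1.1):
    u, dt = upwind_advection(u0, alpha=1.0, dx=x[1] - x[0], cfl=cfl, n_steps=50)
    print(f"CFL = {cfl}: max|u| after 50 steps = {np.abs(u).max():.3f}")
```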

Iterative Convergence Errors
Judging when a CFD simulation is converged is not always easy; different codes and different applications behave very differently. Before we pay attention to the convergence issue, it is prudent to establish what the proper convergence level for the solution is. According to Wikipedia, in computational mathematics, an iterative method is a mathematical procedure that generates a sequence of improving approximate solutions for a class of problems; a specific implementation of an iterative method, including the termination criteria, is an algorithm of the iterative method. An iterative method is called convergent if the corresponding sequence converges for given initial approximations. A mathematically rigorous convergence analysis of an iterative method is usually performed; however, heuristic-based iterative methods are also common. In the problem of finding the root of an equation (or a solution of a system of equations), an iterative method uses an initial guess to generate successive approximations to a solution. For a pure aero-simulation on a fairly coarse grid, convergence is easy to judge, but for more complex simulations involving resolved boundary layers, heat transfer, combustion, etc., convergence can be very tricky.


Aside from looking at residuals, one should always also look at how global parameters like static pressure distributions, total pressure losses, skin friction, heat transfer, etc. change in time. To summarize, at convergence the following should be satisfied27:

 All discrete conservation equations (momentum, energy, etc.) are obeyed in all cells to a specified tolerance, OR the solution no longer changes with subsequent iterations.
 Overall mass, momentum, energy, and scalar balances are achieved.

Simply put, for a time-marching or time-accurate strategy, this involves examining whether the final time has been reached with proper convergence at each time step. For a space-marching strategy, this involves examining whether the end of the marching segment has been reached with proper convergence at each marching step.
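As a rough sketch of these criteria, assuming hypothetical residual and imbalance values taken from a solver's monitor output (the names and thresholds are illustrative, not from any particular code):

```python
def converged(residuals, imbalance_pct, monitor_change_pct,
              res_tol=1e-4, imbalance_tol=1.0, monitor_tol=0.1):
    """Crude convergence check combining the criteria listed above.

    residuals          : dict of current scaled residuals per equation
    imbalance_pct      : overall mass/energy imbalance in percent of inlet flux
    monitor_change_pct : percent change of key monitor points over recent iterations
    """
    residuals_ok = all(r < res_tol for r in residuals.values())
    balances_ok = imbalance_pct < imbalance_tol
    monitors_ok = monitor_change_pct < monitor_tol
    return (residuals_ok or monitors_ok) and balances_ok

# illustrative values only
print(converged({"continuity": 3e-5, "x-mom": 8e-5, "energy": 5e-7},
                imbalance_pct=0.3, monitor_change_pct=0.02))
```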

Figure 3.13 Effect of CFL Number on Convergence of 1D Wave Equation

Monitoring Convergence using Residual History
Generally, a decrease in residuals by three orders of magnitude indicates at least qualitative convergence; at this point, the major flow features should be established. The scaled energy residual should decrease to 10^-6 (for the pressure-based solver), and scaled species residuals may need to decrease to 10^-5 to achieve species balance.

Monitoring Quantitative Convergence
Two important aspects of quantitative monitoring are:

 Monitor other relevant key variables/physical quantities for confirmation.
 Ensure that overall mass/heat/species conservation is satisfied.

In addition to residuals, you can also monitor lift, drag, and moment coefficients, as well as relevant variables or functions (e.g., surface integrals) at a boundary or any defined surface.

27 Mike Kuron, M.S.M.E., Project Manager at CAE Associates.


Furthermore, in addition to monitoring residual and variable histories, you should also check for overall heat and mass balances. The net flux imbalance (shown in the GUI as Net Results) should be less than 1% of the smallest flux through the domain boundary. If the solution monitors indicate that the solution is converged, but the solution is still changing or has a large mass/heat imbalance, this clearly indicates the solution is not yet converged. In this case, you need to reduce the values of the Convergence Criterion or disable Check Convergence in the Residual Monitors panel and continue iterating until the solution converges.

Norms of Convergence Error
A more attractive way to estimate the iterative error is to use norms of the change in the solution from one iteration to the next28. The iterative error is related to the non-linearity of the system of partial differential equations solved in CFD. There are several sources of non-linearity in the RANS equations:

 The convective terms. The usual linearization procedures are Picard or Newton methods, which imply an iterative solution.
 The turbulence closure. For example, one- and two-equation eddy-viscosity models have non-linear convective terms and non-linear production and dissipation terms. Also, the turbulence model equations are often solved segregated from the continuity and momentum equations.

Furthermore, the linear system of algebraic equations obtained from the discretization of the linearized partial differential equations is rarely solved with a direct method. Therefore, the flow solution includes an extra iterative cycle corresponding to the method applied in the solution of the linear systems of equations. In most flow solvers, no clear distinction is made between the various iterative cycles. Therefore, in the estimation of the iterative error it is important to point out the meaning of one iteration of the solution procedure.

L_\infty(\phi) = \max_{1 \le i \le N_P} \left|\Delta\phi_i\right|, \qquad
L_1(\phi) = \frac{\sum_{i=1}^{N_P} \left|\Delta\phi_i\right|}{N_P}, \qquad
L_2(\phi) = \sqrt{\frac{\sum_{i=1}^{N_P} \left(\Delta\phi_i\right)^2}{N_P}}
\qquad \text{Eq. 3.3}

Where NP stands for the total number of nodes of a given grid and ∆ϕ for the local change of the flow quantity ϕ. Two options were used for ∆ϕ; the variable change between consecutive iterations, ∆ϕd =ϕn - ϕn-1, and the normalized residual of the discretized equations, ∆ϕr. In using these options, there are some important practical details. The difference in ϕ between iterations, ∆ϕd, is readily evaluated. However, it may be affected by the use of under-relaxation in the calculation procedure. If implicit under-relaxation schemes are applied, as for example local time-stepping, ∆ϕd will reflect its influence correctly. On the other hand, explicit under-relaxation must be handled carefully; ∆ϕd should be calculated before the under-relaxation is applied, otherwise, the values of ∆ϕd will become artificially small. The relation between the residual of the discretized equations and the flow quantities depends on the method adopted. Nevertheless, in general, the normalized residual, ∆ϕr, is equivalent to the differences in the solution of a Jacobi iteration for the system of equations of a given L. Ec¸a, M. Hoekstra,” On the Influence of the Iterative Error in the Numerical Uncertainty of Ship Viscous Flow Calculations”, 26th Symposium on Naval Hydrodynamics Rome, Italy, 17-22 September 2006 28


The term normalized means that the main diagonal of the system is scaled to one in order to obtain a right-hand side which represents a change in the dependent variable. Then Δϕr is also a measure of the differences between consecutive iterations. The values of L1, L2 and L∞ obtained in any iteration n that satisfies the selected convergence criteria may be used as iterative error estimators. However, there is no guarantee that these values bound the iterative error, especially when the rate of convergence is small.

Case Study – 2D Flow Over a Hill
The calculations of the turbulent flow around a two-dimensional hill, on a 241x241 grid at a Reynolds number of 6x10^4, were performed with the one-equation eddy viscosity model of Spalart & Allmaras. In each of the seven grids tested, the calculations were started from scratch by copying the inlet profiles to the complete flow field. In this test case, we have chosen to monitor the behavior of the two Cartesian velocity components, U1 and U2, and the pressure coefficient, Cp. All the quantities presented are made dimensionless using the hill height and the mean centerline velocity as the reference values. The reference pressure used to compute Cp is the pressure at the outlet of the computational domain, i.e., Cp = 0 at the outlet. The convergence criterion is based on the maximum difference between consecutive iterations, L∞(Δϕd). Figure 3.14 presents the iterative error of U1 based on the solution converged to machine accuracy on the finest grid. With the data shown, it is easy to assess the quality of the iterative error estimators based on the last iteration performed. For the three levels of et plotted, the values of L2(Δ(U1)d) at the last iteration performed are 10^-4.3, 10^-6.6 and 10^-9.1. Iterative error estimates based only on the values of the last iteration performed are not reliable29. The L∞ norm is best suited for iterative error estimation. The L2 norm is clearly worse, but still a better choice than the L1 norm. The results based on Δϕd are globally the most consistent. The most appropriate norm to perform iterative error estimation is the L∞ norm; the L2 and L1 norms do not lead to iterative error estimates which are representative of the complete computational domain. In most of the tested cases, the error estimators based on these two norms do not bound the iterative error obtained from the difference with the solution converged to machine accuracy. The locations where such estimates fail to bound the iterative error may cover a significant part of the computational domain.

Figure 3.14 Estimated Iteration Error of U1 for Different Levels of the Tolerance Criterion et

29 L. Eça, M. Hoekstra, “On the Influence of the Iterative Error in the Numerical Uncertainty of Ship Viscous Flow Calculations”, 26th Symposium on Naval Hydrodynamics, Rome, Italy, 17-22 September 2006.


Estimates of the iterative error based only on the results of the last iteration, be it the change in the variables or the normalized residuals, are unreliable. In most cases, a major part of the computational domain exhibits iterative errors larger than these last-iteration values. Indeed, the maximum values of the iterative error may be more than one order of magnitude larger than the differences between consecutive iterations or the normalized residuals of the last iteration. Extrapolation to an infinite number of iterations improves the performance of the iterative error estimation significantly. A least squares fit to a geometric progression seems to be a good option for making reliable estimates of the iterative error.

Computer Round-off Errors
Any computed solution may be affected by rounding to a finite number of digits30. In some calculations, the magnitude of round-off errors is proportional to the number of grid points in the domain; in these cases, refining the grid may decrease the truncation error but increase the round-off error. When using single precision, care needs to be taken to avoid round-off errors. Inviscid Euler simulations and simulations using wall-function meshes can most often be performed in single precision. For well resolved boundary layers with y+ close to 1, it is often necessary to use double precision. If using double precision for very fine mesh resolutions, make sure that you also create the mesh in double precision and not just run the solver in double precision. Sometimes a single precision solver converges more slowly than a double precision solver due to numerical errors caused by round-off. When using advanced physical models like combustion, free-surface simulations, sprays, and transient simulations with rapid mesh motions, it is also often necessary to use double precision. A notorious example is the fate of the Ariane rocket launched on June 4, 1996 (European Space Agency 1996): in the 37th second of flight, the inertial reference system attempted to convert a 64-bit floating point number to a 16-bit number, but instead triggered an overflow error which was interpreted by the guidance system as flight data, causing the rocket to veer off course and be destroyed31. Round-off error in a numerical method is error that is caused by using a discrete number of significant digits to represent real numbers on a computer. Since computers can retain a large number of digits in a computation, round-off error is problematic only when the approximation requires that the computer subtract two numbers that are nearly identical. This is exactly what happens if we apply an approximation to intervals that are too small. Thus, the effort to decrease truncation error can have the unintended consequence of introducing significant round-off error.
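A small illustration of the cancellation issue described above: subtracting two nearly identical numbers in single precision loses most of the significant digits, while double precision retains them (the NumPy dtypes are used only to make the two precisions explicit; the numbers are arbitrary).

```python
import numpy as np

a, b = 1.0000001, 1.0000000   # nearly identical values, true difference = 1e-7

diff32 = np.float32(a) - np.float32(b)   # single precision subtraction
diff64 = np.float64(a) - np.float64(b)   # double precision subtraction

print("single precision:", diff32)   # ~1.19e-7, almost 20% off
print("double precision:", diff64)   # ~1.0e-7, close to the true difference
```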

Truncation Errors Truncation error represents the difference between the PDE and FDE and represented by order notation, as previously discussed. It is inversely related to the order of accuracy for the equations and would be extremely important criteria in accuracy of discretized equation as it is directly related to the stability consideration. Often, truncation error also includes discretization error, which is the error that arises from taking a finite number of steps in a computation to approximate an infinite process. The truncation error associated with 1-D heat equation can be expanded below (Eq. 21.4). It is obvious that better FDE approximation → smaller the Truncation error. But like many other things in CFD, there should be a balanced between FDE approximation and truncation error. For most practical applications, 2nd order accuracy would be sufficient32. Anderson, Dale A; Tannehill, John C; Plecher Richard H; 1984:”Computational Fluid Mechanics and Heat Transfer”, Hemisphere Publishing Corporation. 31 Weisstein, Eric W. "Round off Error." From Math World - A Wolfram Web Resource. 32 Gerald Recktenwald, 2006. 30


Alternatively, truncation error in a numerical method is error that is caused by using simple approximations to represent exact mathematical formulas. The only way to completely avoid truncation error is to use exact calculations. However, truncation error can be reduced by applying the same approximation to a larger number of smaller intervals or by switching to a better approximation. Analysis of truncation error is the single most important source of information about the theoretical characteristics that distinguish better methods from poorer ones. With a combination of theoretical analysis and numerical experiments, it is possible to estimate truncation error accurately.

\underbrace{\frac{\partial u}{\partial t} - \alpha \frac{\partial^2 u}{\partial x^2}}_{\text{PDE}}
=
\underbrace{\frac{u_j^{n+1} - u_j^{n}}{\Delta t} - \frac{\alpha}{(\Delta x)^2}\left(u_{j+1}^{n} - 2u_j^{n} + u_{j-1}^{n}\right)}_{\text{FDE}}
+
\underbrace{\left[\left\{-\frac{\partial^2 u}{\partial t^2}\frac{\Delta t}{2}\right\} + \left\{\alpha \frac{\partial^4 u}{\partial x^4}\frac{(\Delta x)^2}{12}\right\} + \ldots\right]}_{\text{T.E.}}
\qquad \text{Eq. 3.4}

Practitioners of numerical approximation are most concerned with truncation error, but they also try to restrict their efforts at decreasing truncation error to improvements that do not introduce significant round-off error. Here, we consider only truncation error. We seek information about error on both a local and global scale. Local truncation error is the amount of truncation error that occurs in one step of a numerical approximation. Global truncation error is the amount of truncation error that accumulates in the use of a numerical approximation to solve a problem.
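A quick numerical illustration of local truncation error, and of its interaction with round-off discussed in the previous section: a central-difference approximation of a first derivative has a leading error term of order h², so halving h should roughly quarter the error until round-off takes over at very small h. The function and step sizes below are arbitrary choices for illustration.

```python
import math

def central_diff(f, x, h):
    """2nd order central difference approximation of f'(x)."""
    return (f(x + h) - f(x - h)) / (2.0 * h)

x0 = 1.0
exact = math.cos(x0)                      # derivative of sin at x0

for h in (1e-1, 1e-2, 1e-3, 1e-8, 1e-12):
    err = abs(central_diff(math.sin, x0, h) - exact)
    print(f"h = {h:.0e}  error = {err:.3e}")
# errors drop ~100x per decade of h (truncation, O(h^2)) until round-off
# dominates at the smallest step sizes and the error grows again
```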

Code Errors
These are errors related to bugs in the code used or mistakes made by the programmer. A general methodology called the Method of Manufactured Solutions (MMS) has been proposed to address the issue. As a physically realistic solution is not needed (code verification being a purely mathematical assessment activity, not a physical one), one can suppose an arbitrary, analytic solution field. Then, with a mathematical derivation, where symbolic mathematics systems can help, the unknowns in the differential terms are replaced with the assumed solution. The result, which will generally be different from zero, corresponds to a source term field for the original governing equations and a set of boundary conditions. This source term field will have a complicated but analytical expression and can be set in the simulation code. With this setup, the simulation is launched. As the exact solution is known, it is possible to compute an error with an appropriate norm (typically the L2 norm), which is the difference between the numerical result and the exact solution.
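A minimal sketch of the manufactured-solution derivation described above, using SymPy to generate the source term for an assumed 2D steady heat-conduction (Poisson) model problem; the chosen solution field and the governing operator are illustrative assumptions, not tied to any particular code.

```python
import sympy as sp

x, y, k = sp.symbols('x y k')

# 1. Choose an arbitrary, smooth "manufactured" solution field
u_manufactured = sp.sin(sp.pi * x) * sp.cos(2 * sp.pi * y) + x * y

# 2. Apply the governing operator, here steady heat conduction: L(u) = -k * laplacian(u)
residual = -k * (sp.diff(u_manufactured, x, 2) + sp.diff(u_manufactured, y, 2))

# 3. The non-zero result is the source term to impose in the solver, and the
#    manufactured field provides the exact solution and boundary conditions.
source_term = sp.simplify(residual)
print(source_term)

# During verification, the code solves L(u) = source_term and the observed
# order of accuracy of ||u_numerical - u_manufactured||_2 is checked against
# the formal order of the discretization.
```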

Benchmarking & Inter-Code Issues
The CFD benchmarking project is a large collection of CFD benchmark configurations that are known from the literature33. It stems from the need to compare the results of CFD software not only by the "picture norm" (i.e., looking at the produced pictures and saying, "Oh, that software works quite accurately"), but by comparing hard numerical values against reference values commonly accepted by the CFD community. The benchmark problems here are mainly designed in 2D and aim to give a deeper understanding of how mathematical methods work in practice. This will hopefully also initiate further discussions about the reasonability of one or the other method.

33 Lehrstuhl III, Angewandte Mathematik und Numerik, Technische Universität Dortmund, Germany.


Of course, we try to give references to the literature wherever possible. There will also be a section with "mini-benchmarks", which contains very basic tests (sometimes even with analytical results); this section should serve as a reference giving developers of CFD codes a base for testing different solver components for correctness. A common issue that arises in CFD is the validation and testing of the code to be used for a computation. According to CFD Online, some of the better known cases for validation purposes are:

 1D test cases
   Shock Tube Problem
 2D test cases
   2D Vortex in Isentropic Flow
   2D Riemann Problem
   2D Laminar/Turbulent Driven Square Cavity Flow
   Circular Advection
   Explosion Test in 2D
   Lid-driven Cavity Problem
   Jeffery-Hamel Flow
   Flow Over Backward Facing Step (Laminar – Turbulent)
   Flow Around a Circular Cylinder
   Flow Across a Square Cylinder
   NACA 0012 Airfoil
   RAE 2822 Airfoil
   Ringleb Flow
   Scramjet Intake
   Suddhoo-Hall Airfoil
   Turbulent Flat-Plate
   Viscous diffusion of multiple vortex system
   Williams Airfoil
   2D Ramp in Channel Problem
   2D Single Mode Rayleigh-Taylor Instability
   2D Single Mode Richtmyer-Meshkov Instability
   2D Mach 3 Wind Tunnel With a Step
   Gresho Vortex
 3D test cases
   Ahmed Body
   Flow in the 180 degree U-Bend Square Duct
   DARPA SUBOFF Model
   Hypersonic Blunt Body Flow
   ONERA M6 Wing
   Turbomachinery
     Eckardt Centrifugal Compressor
     NASA Rotor 37 for Axial Rotors
     NASA Rotor 67 for Axial Fans
   3D Single Mode Rayleigh-Taylor Instability
   3D Single Mode Richtmyer-Meshkov Instability
   Free-Surface Piercing NACA 0024 Hydrofoil


 Transition test cases
   2D Cascade

Case Study 1 – Results of M6 Wing using NASA Codes on the Same Grid
As a 3D example, consider the M6 wing for comparing aerodynamic quantities using NASA codes. The flow conditions are set to M = 0.82, α = 5 degrees, and Re = 10^7, with an algebraic turbulence model. Table 3.2 shows a comparison of the results.

Aerodynamic Quantities | Lift Coefficient | Drag Coefficient | Normal Force Coefficient
CFL3D | 0.5256 | 0.04097 | 0.5272
TLNS3D | 0.5254 | 0.04290 | 0.5271
ENSAERO | 0.5212 | 0.04230 | 0.5230
ADPAC | 0.5217 | 0.03453 | 0.5227
OVERFLOW | 0.5267 | 0.04422 | 0.5285

Table 3.2 NASA Code Comparisons for Surface Forces on the M6 Wing

Case Study 2 - Grid Convergence for 3D Benchmark Turbulent Flows
Grid convergence studies are performed by [Diskin et al.]34 to establish reference solutions for benchmark 3D turbulent flows in support of the ongoing turbulence model verification and validation effort at NASA. The benchmark cases are a subsonic flow around a hemisphere cylinder and a transonic flow around the ONERA M6 wing with a sharp trailing edge. The study applies widely-used CFD codes developed and supported at the NASA Langley Research Center, namely FUN3D, USM3D, and CFL3D; a description of the codes is available in35. Reference steady-state solutions are computed for the RANS equations in conjunction with the Spalart-Allmaras turbulence model on families of consistently-refined grids composed of different types of cells. Coarse-to-fine and code-to-code solution variation is described in detail; for further details, readers should consult the work in36.

3.11.2.1 Subsonic Flow around a Hemisphere Cylinder
Five grid families are generated for this study. Unstructured grids of families 1 to 4 have triangular faces on the hemisphere surface and no polar singularity. Each family has four levels of nested grids: L1 is the finest grid level, L2 the second finest, and so on. Statistics of grids from families 2 (tet), 4 (prism/hex), and 5 (structure) are shown in Table 3.3. The structured grids of family 5 have a polar singularity at the axis attached to the apex of the hemisphere, i.e., along this polar axis, hexahedral cells degenerate into prismatic cells. Unstructured grids corresponding to the same level have the same distribution of grid nodes. In comparison with unstructured grids at the same level, family 5 (structure) grids have the same number of surface elements on the hemisphere surface, the same distribution of nodes on the cylinder surface, and more nodes on the hemisphere surface. All the L1 grids have the near-surface normal spacing approximately corresponding to y+ = 0.5. FUN3D solutions are computed on grids of families 2 (tet) and 4 (prism/hex); SFE solutions are computed on family 2 (tet) grids.

34 B. Diskin, W. K. Anderson, M. J. Pandya, C. L. Rumsey, J. L. Thomas, Yi Liu, and H. Nishikawa, “Grid Convergence for Three Dimensional Benchmark Turbulent Flows”, AIAA Aerospace Sciences Meeting, 2018.
35 See Previous.
36 See Previous.


USM3D solutions are computed on grids of families 2 (tet) and 5 (structure), and CFL3D solutions are computed on family 5 (structure) grids.

Table 3.3 Statistics of the four finest grids for the hemisphere cylinder grid families (Courtesy of [Diskin et al.])

3.11.2.2 Geometry, Flow Parameters, and Boundary Conditions
The geometry is taken from the experimental study reported by [Tsieh]37. In the experiment, the radius of the hemisphere was 0.5 in., the body length was 10 in., and the unit Reynolds number was 4.2x10^6 per foot. Thus, in the computational domain with the unit length taken as 1 in., the hemisphere radius is 0.5, the cylinder length is 9.5, and the Reynolds number is Re = 3.5x10^5 per unit length. The reference solutions have been computed at the following flow conditions: reference Mach number Mref = 0.6, angles of attack of 0, 5, 10, 15, and 19 degrees, and reference temperature Tref = 540 degrees R. Only solutions corresponding to the 19 degree angle of attack are presented here. The origin of the coordinate system is located at the apex of the hemisphere. The positive x direction is the streamwise direction, collinear with the axis of the hemisphere and cylinder. Figure 3.15 sketches the layout of the boundary conditions and shows a global view of a computational grid with half-plane symmetry.

Figure 3.15 Global View of the Computational Grid and Boundary Conditions (Courtesy of [Diskin et al.])

37 Tsieh, T., “An Investigation of Separated Flow About a Hemisphere Cylinder at 0 to 19 Degrees Incidence in the Mach Number Range from 0.6 to 1.5,” AEDC-TR-76-112, 1976.


The downstream computational boundary is located at the back of the cylinder, x = 10. The outflow conditions specified at the downstream boundary are constant pressure conditions corresponding to P = Pref = 1. The far field boundary is a hemisphere with a radius of 100 units centered at x = 10, y = 0, z = 0.

3.11.2.3 Results for Hemisphere Cylinder
For solution visualization, Figure 3.16 presents the FUN3D solution computed on the prism/hex L1 grid. The pressure contours and streamlines are shown in two planes corresponding to y = 0 and x = 6.0. The pressure is non-dimensionalized by ρref·a²ref, where ρref and aref are the dimensional freestream density and speed of sound, respectively. In the symmetry plane corresponding to y = 0, the cross-stream separation is characterized by downward flow velocity. The separation occurs behind the hemisphere-cylinder junction and continues for the entire cylinder length. A minimum pressure is observed on the leeside, upstream of the hemisphere-cylinder junction. A large primal vortex and a smaller secondary vortex are shown in the crossflow plane corresponding to x = 6.0. The separation locations of these primal and secondary vortices are similar to those documented in the experiment. An off-body vortex is seen in the shear layer of the primal vortex, outboard of the secondary one.

Figure 3.16 Global View of Hemisphere Cylinder Pressure Contours using the L1 grid at surfaces y = 0 (left) and x = 6 (right) (Courtesy of [Diskin et al.])

3.11.2.4 Forces and Pitching Moment
Grid convergence plots of the lift, total drag (including pressure and viscous components), and pitching moment coefficients and of the maximum eddy viscosity are shown in Figure 3.17. The value of the characteristic grid spacing, h, is computed as h = N^-1/3, where N is the number of degrees of freedom (cells for USM3D and CFL3D, nodes for FUNFV38 and SFE). The aerodynamic coefficients computed with different codes on different grid families are generally converging to the same limit with grid refinement. Convergence of the maximum eddy viscosity is less clear, mainly because of the disagreement between limit projections from FUNFV (prism/hex) solutions and other solutions, even though the SFE and FUNFV (prism/hex) solutions agree well on the finest L1 grids. Overall, code-to-code aerodynamic coefficient variation from the L4 grids to the L1 grids is up to 20%.

38 Two different discretizations available in FUN3D are employed: the baseline finite-volume discretization (FUNFV) and a recently implemented stabilized finite-element discretization (SFE) based on a Streamlined Upwind Petrov-Galerkin formulation.


In this estimate and in the rest of the paper, relative variation is computed with respect to the middle of the variation range. Extrapolation to the infinite-grid limit is problematic because no reliable order of convergence can be established. No solution appears to converge uniformly in all quantities. Three solutions, USM3D (structure), FUNFV (prism/hex), and SFE, converge monotonically. Considering lift, USM3D (structure) solutions show less than first-order convergence, i.e., the lift approaches the limit from above with a concave shape. The FUNFV (prism/hex) and SFE lift curves approach the limit from above with convex shapes, indicating a convergence order that is higher than first order. The FUNFV (tet) lift appears to converge with first order on the three finer grids. The USM3D (tet) lift converges from above and changes the curve shape from concave to convex. Considering pressure drag convergence, the FUNFV (prism/hex) and SFE convergence curves approach the limit with convex shapes from above, but intersect. Lacking an exact solution, we use a quantitative characterization of observed solution variation to evaluate accuracy. Variation of the aerodynamic coefficients computed on the L1 grids is described in Table 3.4. The largest relative difference among all solutions is observed for the pitching moment and does not exceed 4.4%. Accuracy of aerodynamic coefficients improves in proportion to the degrees of freedom used in CFD computations. This property is the foundation of all grid refinement studies. It also justifies the expectation of accuracy benefits from tetrahedral-grid cell-centered formulations that provide more degrees of freedom on grids of the same level. The USM3D solutions use about six times more degrees of freedom on grids of family 2 (tet) than other solutions on grids of the same level. Because the grid convergence shown in Table 3.4 is not regular, quantitative assessments of accuracy improvements due to additional degrees of freedom are difficult and imprecise. Qualitatively, the aerodynamic coefficients computed by USM3D (tet) on the L2 grid are within the variation range of the L1 solutions. Looking at the grid convergence on the three finer grids, the maximum and minimum values of the integrated aerodynamic quantities have generally been exhibited by the CFL3D solutions and the FUNFV (tet) solutions (the only exception is that, for the viscous drag coefficient, the minimum is exhibited by the SFE solutions). Relative variation among the core-group L1 solutions is also shown in Table 3.4. The deviations of the CFL3D solutions from the core-group solutions may be attributed to the thin-layer approximation. The abnormalities in the FUNFV (tet) solutions observed on the current grids are harder to explain. In the limit of grid refinement, all FUNFV and SFE solutions are expected to converge to the same “infinite-grid” solution. On the current grids, nonphysical oscillatory solution modes resembling checker-board instabilities were observed in the FUNFV (tet) solutions with the default MUSCL scheme coefficient, κ = 0.0. The FUNFV solutions computed on grids of other families are smooth. Note that the default value of the MUSCL scheme coefficient on non-tetrahedral grids is κ = 0.5. In this study, an increased coefficient of κ = 0.75 is used for the FUNFV (tet) solutions. Solutions with κ = 0.75 do not exhibit nonphysical oscillations, but appear to be somewhat less accurate.

Table 3.4 Hemisphere Cylinder: Variation of Aerodynamic Coefficients on L1 Grids (Courtesy of [Diskin et al.])
Although not shown, FUNFV solutions with κ = 0.0 were computed on tetrahedral grids by using the


approximate mapping discretization method for inviscid fluxes39-40. Approximate-mapping solutions do not exhibit nonphysical oscillations and provide aerodynamic coefficients well within the core-group variation range.

Figure 3.17 Grid Convergence of Aerodynamic Forces for the Hemisphere Cylinder (Courtesy of [Diskin et al.])
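Two small bookkeeping formulas used repeatedly in the grid-convergence comparison above, the characteristic grid spacing h = N^-1/3 and the relative variation with respect to the middle of the variation range, can be sketched as follows (the degree-of-freedom count and coefficient values below are placeholders, not the actual grid statistics):

```python
# characteristic grid spacing from the number of degrees of freedom (3D)
def char_spacing(n_dof):
    return n_dof ** (-1.0 / 3.0)

# relative variation of a set of code-to-code values, taken with respect to
# the middle of the variation range (as done for the L1-grid comparisons)
def relative_variation(values):
    vmin, vmax = min(values), max(values)
    mid = 0.5 * (vmin + vmax)
    return (vmax - vmin) / mid

# placeholder numbers for illustration only
print(char_spacing(360e6))                        # ~0.0014 for ~360 million DOFs
print(relative_variation([0.2692, 0.2711, 0.2705]))
```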

3.11.2.5 Fine Grid Surface Pressure, Skin Friction, and Off-Body Variation
In this section, surface pressure and skin friction are shown for four sets of solutions: USM3D (tet), FUNFV (prism/hex), SFE, and CFL3D. Hereafter, only these four hemisphere-cylinder solutions of the available set are shown, mainly for conciseness and presentation clarity. First, global views of solution variation on the L1 grids are shown.

39 Diskin, B. and Thomas, J. L., “Comparison of Node-Centered and Cell-Centered Unstructured Finite-Volume Discretizations: Inviscid Fluxes,” AIAA Journal, Vol. 49, No. 4, 2011, pp. 836-854.
40 Diskin, B. and Thomas, J. L., “Erratum: Comparison of Node-Centered and Cell-Centered Unstructured Finite-Volume Discretizations: Inviscid Fluxes,” AIAA Journal, Vol. 51, No. 1, 2013, pp. 277.


Figure 3.18 displays the surface pressure and the x-component of skin friction at the symmetry plane corresponding to y = 0. The pressure maximum, indicating the leading edge stagnation, is located near x = 0.03 on the windward side of the hemisphere. The pressure minima are observed on the leeside near x = 0.3 and on the windward side near x = 0.45. A zone of low pressure is also observed on the leeside at x > 4. Near the outflow boundary, the leeside pressure increases and the windward pressure decreases, creating a small negative-lift zone. The L1-grid surface pressure distributions computed with different codes are almost indistinguishable. The fine-grid code-to-code differences in the surface pressure at the local extrema located on the hemisphere are within 0.5%. Although comparisons with experimental data are not the focus of this paper, the computed surface pressure agrees qualitatively with the experimental measurements. Figure 3.18-(b) shows the x-component of the skin friction vector. Note that the y and z components of the skin friction are zero at the symmetry plane. The skin-friction profiles computed with different codes are similar in most places. The largest discrepancy, of about 15%, is observed on the leeside near x = 6.5. Other places of noticeable but more local discrepancies are downstream of the hemisphere-cylinder junction at x = 0.5 and near the outflow boundary. The off-body solutions are non-dimensionalized by aref and are plotted along a vertical line attached to the upper surface of the cylinder at x = 5 and y = 0.21. The view is chosen to show the solution variation across the core of the primal crossflow vortex located in this area. All off-body profiles are over-plotted in the global view.

Figure 3.18 Global View of Surface Pressure and Skin Friction at the Symmetry Plane (y = 0) for the Hemisphere Cylinder (Courtesy of [Diskin et al.])

3.11.2.6 Effect of Grid Refinement on Surface Pressure and Skin Friction
Grid refinement and zooming have been applied to study solution variation near nontrivial flow features and near regions of the largest solution differences observed in the global views. Although not shown, global views of the surface pressure profiles in the three planes (y = 0, z = 0, and x = 5) computed on the four grids of the corresponding families are hardly distinguishable. The results in [Diskin et al.]41 illustrate local grid convergence of the leeside surface pressure near the location of the minimum pressure. Even in the zoomed view, only the coarsest L4 grid solutions are clearly distinguishable.

57

distinguishable. All codes appear to approach the same limit. Variation between L1 solutions is less than 0.07%. The minimum pressure coefficients computed with USM3D, FUNFV, and CFL3D decrease monotonically with grid refinement and show similar coarse-to-fine grid variation of about 7%. The SFE minimum pressure coefficients show remarkably small coarse-to-fine grid variation of less than 0.5%, but converge non-monotonically with grid refinement. For complete discussion, please consult the work by [Diskin et al.]42. 3.11.2.7 Transonic Flow Around an M6 Wing The ONERA M6 experiment has been widely used for validation of CFD solvers43. A relatively simple, well-documented geometry and a rich experimental database for a large variety of ow conditions provide a unique combination for practical and inexpensive benchmark studies. Reference solutions for transonic flows around the M6 wing are presented in this section. Authors believe that the solutions computed on grids with more than 360 million degrees of freedom represent the largest M6 computations conducted to date. The grid generation, coarsening, partition, and multigrid capabilities for the M6 model are described in an accompanying paper44. The M6 wing geometry used in this study has been slightly redefined for numerical analysis of turbulence model simulations. 3.11.2.8 Geometry, Flow Parameters and Boundary Conditions Recently, a group at ONERA has considered the M6 model and its past experiments in greater detail4546. As part of this effort, the group has created a new CAD geometry for the wing. In this geometry, the trailing edge of the wing has been made sharp for the purpose of this particular CFD exercise. The reference solutions for the OM6 wing are computed at a freestream Mach number 0.84, Reynolds number 14.6 x 106 based on the unit root chord, and the angle of attack of 3.06 degrees. The far-field boundary in the shape of a hemisphere is located at 100 unit chords. The symmetry condition is assigned at the plane containing the root airfoil. Note that the experiment used a splitter plate near the wing root, which is not modelled by CFD codes. This discrepancy is believed to be the cause of disagreement between CFD solutions and experiment measurements at inboard sections. 3.11.2.9 Grids for M6 Wing The M6 grids used in this study are topologically equivalent to the full-geometry (y = 0 symmetry plane) hemisphere-cylinder grids described before. The cylinder surface is mapped on the wing surface with the specified wing section, and the hemisphere surface is mapped on the rounded wing tip. Five nested grid families have been generated for the M6 geometry by using input profiles available at the TMR website. Statistics of L4 - L1 grids from families 1 (prism), 2 (tet), 4 (prism/hex) and 5 (structure) are shown in Table 3.5. The far field boundary grids are not shown because they look similar to the full-geometry extension of those for the hemisphere-cylinder configuration. The surface grids have a moderate stretching toward the leading and trailing edges resulting in a relatively coarse grid spacing in the mid-chord region. All the L1 grids have the first node off the surface located at an average of approximately y+ = 0.5.

See Previous. Schmitt, V. and Charpin, F., “Pressure Distribution on the ONERA-M6-Wing at Transonic Mach Numbers," In Experimental Data Base for Computer Program Assessment. Report of the Fluid Dynamics Panel Working Group 04, AGARD AR 138, 1979. 44 Nishikawa, H. and Diskin, B., “Customized Grid Generation and Processing for Benchmark Three-Dimensional Flows," SciTech-2018, Kissimmee, FL, Jan., 2018, To be published as AIAA Paper. 45 Gleize, V., Dumont, A., Mayeur, J., and Destarac, D., “RANS simulations on TMR test cases and M6 wing with the ONERA elsA flow solver (Invited)," AIAA Paper 2015-1745, 2015. 46 Mayeur, J., Dumont, A., Gleize, V., and Destarac, D., “RANS simulations on TMR 3D test cases with the ONERA elsAfl ow solver," AIAA Paper 2016-1357, 2016. 42 43

58

Table 3.5 Statistics of Grids for OM6 Wing Grid Families
3.11.2.10 Results for M6 Wing
Figure 3.19 presents the contours of the surface pressure computed by USM3D on the prism/hex L1 grid of family 4. The pressure is non-dimensionalized by ρref a²ref. A lambda shock is clearly visible on the surface, with the shock intersection located at about 80% of the wingspan.

Figure 3.19 M6 wing: pressure contours computed by USM3D on family 4 prism/hex L1 grid - (Courtesy of [Diskin et al.])

Grid convergence of aerodynamic coefficients is described next. USM3D solutions have been computed on grids of families 2 (tet) and 4 (prism/hex); FUNFV solutions have been computed on grids of families 1 (prism) and 4 (prism/hex); and CFL3D solutions have been computed on structured grids of family 5. All computations have been conducted with no flux limiters. Figures 3.20 (a) and (b) show convergence of the lift and total drag coefficients. No solution converges monotonically for all plotted quantities; thus, no order property can be deduced from the observed convergence. Nevertheless, all solutions approach the same aerodynamic coefficient values in the limit of grid refinement. The slopes of the pitching moment convergence curves shown in Figure 3.21 are highly irregular for solutions on grid families 4 (prism/hex) and 5 (structured). For example, the pitching moment coefficient computed from the family 4 USM3D (prism/hex) solutions decreases initially with grid refinement from the L4 grid to the L3 grid, increases on the L2 grid, and decreases again on the L1 grid. Lift and pitching moment convergence observed for FUNFV (prism) and USM3D (tet) solutions is more regular. The differences among lift and pitching-moment coefficients computed by all codes on all grids do not exceed 6%. Drag coefficients appear to be converging with more regular slopes, but do not provide convergence patterns suitable for the infinite-grid extrapolation. The total and pressure drag coefficients computed from CFL3D and FUNFV solutions change the direction of convergence on the L1 grids. The viscous drag coefficient computed from the USM3D (tet) solution changes the direction of convergence on the L1 grid. Only USM3D (prism/hex) solutions converge monotonically for the three drag coefficients. Relative variation of drag coefficients computed on different grids is more significant than variation of the lift and pitching moment coefficients; pressure and viscous drag coarse-to-fine variation is approximately 30% and 16%, respectively. To establish solution accuracy, Table 3.6 shows code-to-code variation of the forces, pitching moment, and maximum eddy viscosity on the L1 grid. Among all integral aerodynamic coefficients, the maximum relative difference of 0.94% is observed for the pressure drag. Maximum eddy viscosity variation exceeds 10%, indicating considerably higher uncertainty than in the integrated quantities. For quantities that converge regularly in grid refinement, e.g., lift (Figure 3.20-(a)) and pitching moment (Figure 3.21), the USM3D (tet) solutions appear to provide significant accuracy benefits on same-level grids.

Table 3.6 Variations of Aerodynamic Coefficients - (Courtesy of [Diskin et al.])

Figure 3.20 M6 Grid Convergence of Aerodynamic Forces CL, CD

Variation of surface pressure coefficients
computed on the L1 grids at the measurement sections used in the experiment is shown in [Diskin et al.]47. Only three computations, FUNFV (prism/hex), USM3D (prism/hex), and CFL3D (structured), are used in this section for succinctness. The three codes extract the surface pressure at the same spanwise locations48. In the global view, the L1 pressure profiles from different codes are in close agreement. Small oscillations in the FUNFV solutions are observed near the shocks. All solutions place the shocks at the same locations and identify the same pressure minima on the lower and upper wing surfaces. The pressure profiles at the leading and trailing edges are indistinguishable. As compared to other studies, an improved agreement with the experiment is observed at section 4. This improvement is attributed to the increased grid resolution provided by the L1 grids. Figure 3.22 shows a global view of leeside pressure grid refinement at section 1 (η = 0.2). The pressure plots show significant variation with grid refinement. The mid-chord grid spacing on the L4 and L3 grids is too coarse to represent details of the pressure profiles; the corresponding coarse-grid solutions miss most of the shock structure and are significantly different from the solutions obtained on the fine grids. All solutions computed on the L2 and L1 grids represent the shock details and agree with each other remarkably well. The grid convergence patterns of the USM3D and CFL3D solutions are quite similar, as expected, because both codes use cell-centered formulations. FUNFV uses a node-centered formulation and exhibits a different convergence pattern. All codes identify the minimum of pressure at the same location49, η ≈ 0.39. The code-to-code discrepancy in the minimum-pressure value is about 0.09%. The minimum pressure computed from all solutions converges monotonically with grid refinement and demonstrates at least a second-order convergence rate. Most of the L4 and L3 solutions completely miss the double-shock structure in this region. Only the FUNFV L3 solution indicates the presence of a shock structure; the USM3D and CFL3D L3 solutions miss it. However, all the L1 solutions predict a double-shock structure in this region and agree well with each other. All the L1 solutions predict a pressure plateau between the two shocks at 0.3 < η < 0.35. The normalized x-direction grid spacing at this location is Δx/c ≈ 0.02, providing just four grid nodes across the plateau. In spite of the minimal grid resolution, the maximum code-to-code difference between pressure values on this plateau is less than 6%.

Figure 3.21 M6 Grid Convergence of Pitching Moment

47 See previous.
48 The wingspan is taken as b = 1.47601797621980 and the relative axial position was computed as x/c = (x - xmin)/(xmax - xmin).
49 η = x/c.

3.11.2.11 Concluding Remarks
Detailed grid-convergence studies for two benchmark three-dimensional (3D) flows have been conducted by [Diskin et al.]50 to establish reference solutions for the Reynolds-Averaged Navier-Stokes (RANS) equations using the Spalart-Allmaras turbulence model. The benchmark flows are a subsonic flow around a hemisphere cylinder and a transonic flow around the ONERA M6 wing (M6) with a sharp trailing edge. The reference solutions have been computed with three widely used CFD codes developed at NASA Langley: FUN3D, USM3D, and CFL3D. The codes use different discretization approaches and iterative solution methods. Two different unstructured-grid second-order node-centered discretizations available in FUN3D are used for the hemisphere-cylinder computations: the FUNFV discretization uses a standard finite-volume scheme and the SFE discretization uses a recently added stabilized finite-element formulation. SFE is not used for the M6 computations. USM3D uses an unstructured-grid second-order cell-centered finite-volume formulation. CFL3D uses a second-order cell-centered structured-grid formulation. Five families of consistently-refined nested grids of different topology have been generated for the studies, including both structured grids and unstructured grids with various types of elements. The finest family grids provide from 60 million to over 400 million degrees of freedom. To eliminate iterative errors, all solutions on all grids have been converged to near machine-zero residual levels. Although turbulence model validation is not the focus of the paper, the reference solutions have been compared with available experimental data. The main focus is assessing variation between CFD solutions computed with different codes on different families of consistently-refined grids. All codes show close agreement in predicting aerodynamic coefficients for the separated flow around the hemisphere-cylinder configuration. The code-to-code discrepancy among all aerodynamic coefficients computed on the finest family grids is less than 4.5%
and variation among a core group of four solutions is less than 0.75%. The coefficients appear to converge to the same limit with grid refinement, but no convergence order can be discerned from the observed convergence. There is more uncertainty about the grid convergence limit of the maximum eddy viscosity. The surface pressure and skin friction in different fine-grid solutions overplot in most global views. A local disagreement among the codes of about 15% is observed in the vicinity of the leeside just past the middle section of the cylinder. Various off-body solution components probed outside of this region also overplot. Local solution characteristics, such as surface pressure minima and the circumferential angle of vortex separation and reattachment locations, also converge to the same limit with grid refinement. The reference solutions compare well with available experimental data. The reference solutions for the transonic flow around the M6 wing have been computed using the three formulations FUNFV, USM3D, and CFL3D. The aerodynamic coefficients computed by different codes on the finest grids of different families agree well; the maximum difference among all coefficients does not exceed 0.73%. The difference in maximum eddy viscosity is 10.3%, which is much larger than the corresponding difference in the aerodynamic coefficients. The surface pressures computed with the three codes have been compared at seven OM6 wing sections. The pressure profiles computed on the finest grids overplot in the global views. Away from the shocks, all the pressure profiles computed on the two finest grids are close to each other, within a 1-2% range. The increased grid resolution allows for an improved resolution of the lambda-shock feature that was a challenge in past M6 computations. As compared to previous studies available in the literature, the present solutions on the finest grids provide an improved agreement with the experiment. Further details are available in [Diskin et al.]51.

Figure 3.22 M6 Section 1 (η = x/c = 0.2): View of Leeside Pressure Grid Refinement - (Courtesy of [Diskin et al.])

50 See previous.
51 See previous.
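The notions of observed convergence order and infinite-grid extrapolation used throughout these grid-convergence studies can be made concrete with a short calculation. The Python sketch below estimates the observed order of accuracy and the Richardson-extrapolated limit of a single quantity computed on three nested grids; the numerical values are hypothetical placeholders rather than data from [Diskin et al.], and the formulas assume monotone convergence with a constant refinement ratio.

```python
import math

def observed_order(f_coarse, f_medium, f_fine, r):
    """Observed order of accuracy from three solutions on nested grids with a
    constant refinement ratio r (valid only for monotone convergence)."""
    return math.log((f_coarse - f_medium) / (f_medium - f_fine)) / math.log(r)

def richardson_extrapolate(f_medium, f_fine, r, p):
    """Estimate of the infinite-grid (zero-spacing) value."""
    return f_fine + (f_fine - f_medium) / (r**p - 1.0)

# Hypothetical drag-coefficient values on L3, L2, L1 grids (not from the paper)
cd_L3, cd_L2, cd_L1 = 0.01230, 0.01205, 0.01196
r = 2.0  # each level halves the grid spacing

p = observed_order(cd_L3, cd_L2, cd_L1, r)
cd_inf = richardson_extrapolate(cd_L2, cd_L1, r, p)
print(f"observed order p = {p:.2f}, extrapolated CD = {cd_inf:.5f}")
```

When the convergence is non-monotone, as reported for several of the coefficients above, the logarithm argument becomes negative and no meaningful order or extrapolated value can be obtained, which is why the authors refrain from an infinite-grid extrapolation in those cases.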

Usage Errors
Usage errors are due to the application of the code in a less-than-accurate or improper manner. Usage errors may actually show up as modeling and discretization errors. The user sets the models, grid, algorithm, and inputs used in a simulation, which then establishes the accuracy of the simulation. There may be blatant errors, such as attempting to compute a known turbulent flow with an assumption of inviscid flow. A converged solution may be obtained; however, the conclusions drawn from the simulation may be incorrect. Other errors may not be as evident, such as the proper choice of turbulence model parameters for separated flows with shocks. The potential for usage errors increases with the number of options available in a CFD code. Usage errors are minimized through proper training and the accumulation of experience. Good sources of information are [Melot et al.]52 and [Roy et al.]53.

52 Matthieu Melot, Bernd Nennemann and Claire Deschenes, “Verification of transport equations in a general purpose commercial CFD code”, 28th IAHR Symposium on Hydraulic Machinery and Systems (IAHR2016).
53 C. J. Roy, C. C. Nelson, T. M. Smith, and C. C. Ober, “Verification of Euler/Navier-Stokes Codes using the Method of Manufactured Solutions”, International Journal for Numerical Methods in Fluids, 2004.

What to trust and what not to?
CFD is generally quite good at predicting surface static pressure distributions. With care, CFD can also be used to predict performance, total-pressure losses, and blade turning. Predicting separation, stall, and off-design performance can be a challenge, and results with non-attached flows should be interpreted with care. Heat transfer is often very difficult to predict accurately, and it is common to obtain heat-transfer coefficients that are wrong by 100% or more. Validation data are critical in order to be able to trust heat-transfer simulations. Transition is almost impossible to predict accurately in


general. However, there exist models that have been tuned to predict transition, and these tend to give acceptable results for cases close to the ones they were tuned for. In general, time permitting, to contain the numerical errors associated with the calculation of cell gradients and face interpolations, we must54:

• Use higher-order discretization schemes (second-order upwind, MUSCL, etc.)
• Attempt to align the grid with the flow to minimize “false diffusion”
• Refine the mesh – do a mesh-independence study
• Sufficient mesh density is necessary to resolve the salient features of the flow
• Interpolation errors decrease with decreasing cell size
• Minimize variations in cell size in non-uniform meshes (mesh quality)
• Truncation error is minimized in a uniform mesh
• Minimize cell skewness and aspect ratio (mesh quality)

54 CFD Online forums, Convergence.

Verification and Validation for Computational Simulation
There are inherent inaccuracies in any numerical simulation of continuum problems due to the discretization of the domain. Depending on the size of the discrete elements used, numerical instabilities and uncertainties are introduced. To identify and quantify the main sources of these uncertainties, sets of verification and validation procedures are employed. While the terms are often used interchangeably, verification and validation are distinct. Verification is the assessment of the accuracy of the solution by comparison with known solutions; validation is the assessment of the accuracy of the simulation against benchmark experimental data. A more compact explanation is provided by (Roache, 1997): “verification is solving the equations right”, and “validation is solving the right equations”. In short, verification deals with the mathematics; validation deals with the physics.
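One widely used approach to the verification task described above is the Method of Manufactured Solutions, cited earlier in the Usage Errors discussion: an analytical solution is chosen first, and the source term that makes it satisfy the governing equation is derived symbolically. The sketch below is a minimal, generic illustration for a one-dimensional steady advection-diffusion equation using SymPy; the chosen solution and coefficients are arbitrary and are not taken from any of the referenced codes.

```python
import sympy as sp

# Manufactured solution for the 1D steady advection-diffusion equation
#   c * du/dx - nu * d2u/dx2 = S(x)
# Pick u(x) analytically, then derive the source term S(x) that makes it exact.
x = sp.symbols('x')
c, nu = 2.0, 0.1                      # arbitrary illustrative coefficients
u_manufactured = sp.sin(sp.pi * x)    # arbitrary smooth manufactured solution

S = sp.simplify(c * sp.diff(u_manufactured, x) - nu * sp.diff(u_manufactured, x, 2))
print("source term S(x) =", S)
# A code-verification study would add S(x) to the discrete residual, solve on a
# sequence of refined grids, and confirm that the error against u_manufactured
# decays at the scheme's formal order of accuracy.
```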


To minimize the uncertainties and errors within a numerical simulation, sets of parametric studies and comparisons are developed. These are simple, yet time-consuming and tedious, especially for real applications. To minimize the effort, users are advised to consult the guidelines developed by the ITTC, ASME, or the Journal of Fluids Engineering before undertaking a numerical simulation. The Journal of Fluids Engineering guideline is depicted here as a reference.
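The ASME/Journal of Fluids Engineering procedure referred to above quantifies discretization uncertainty through the Grid Convergence Index (GCI), which combines the observed order of accuracy with a safety factor. A minimal Python sketch is given below; the input values are hypothetical, the observed order p would normally be estimated from three grid levels (as in the earlier extrapolation sketch), and the safety factor of 1.25 is the value commonly recommended for three-grid studies.

```python
def grid_convergence_index(f_fine, f_medium, r, p, safety_factor=1.25):
    """Fine-grid GCI (relative discretization-uncertainty estimate):
    GCI = Fs * |e21| / (r**p - 1)."""
    e21 = abs((f_fine - f_medium) / f_fine)   # relative change between grid levels
    return safety_factor * e21 / (r**p - 1.0)

# Hypothetical lift-coefficient values on medium (L2) and fine (L1) grids
cl_L2, cl_L1 = 0.2641, 0.2633
gci = grid_convergence_index(cl_L1, cl_L2, r=2.0, p=1.8)
print(f"GCI_fine = {100.0 * gci:.3f}% of the fine-grid value")
```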


4 CFD in Biomedical Applications
CFD is still emerging in biomedical applications due to the complexity of human anatomy and of human body-fluid behavior55. Nevertheless, it is becoming more accessible and practicable by virtue of the advent of digital computers with high-performance hardware and software. Because knowledge of body fluids and of how the body's systems perform is essential, and because bio-fluid physiology studies have been growing over the last several years, the advancement of biomedical practices and technology has been stimulated. Biomedical research with the aid of CFD software is still emerging; it incorporates the physiology and pathophysiology of the cardiovascular and respiratory systems through simulation. Various simulation and clinical results have been studied, particularly analyses of blood flow and nasal airflow. In most studies, the blood flow analyses address the circulation of blood in relation to ventricle function, the coronary arteries, and the heart valves, while the nasal airflow analyses address the basic airflow in the human nose, drug delivery improvement, and virtual surgery. Examples of CFD simulations applied to the cardiovascular and respiratory systems are depicted in Figure 4.1 (A-B), respectively. CFD modelling has already received tremendous attention from biomedical researchers along with the development of medical devices. Furthermore, detailed characterization of complex physiology and the computation of relevant metrics can be achieved by combining imaging procedures with CFD simulation56. CFD models are continuously being translated into clinical tools for physicians to apply across the spectrum of cardiovascular and respiratory diseases. Therefore, this chapter explores the state-of-the-art of CFD in the clinical area, highlighting the biomedical applications. CFD plays an important role by offering the chance to simulate before making a real commitment to develop medical interventions in the correct direction and to execute any medical design alteration. Research on biomedical CFD applications has received tremendous attention in the past few years due to the importance of computational medical simulations of circulatory functions. The biomedical CFD applications for the cardiovascular and respiratory systems are discussed in the subsequent sub-sections.

Figure 4.1 Example of CFD simulations in (A) cardiovascular and (B) respiratory systems

55 Ernnie Illyani Basri, Adi Azriff Basri, Vizy Nazira Riazuddin, Siti Farhana Shahwir, Mohammad Zuber, Kamarul Arifin Ahmad, “Computational Fluid Dynamics Study in Biomedical Applications: A Review”, International Journal of Fluids and Heat Transfer, Volume 1, Issue 2, June 2016.
56 P.D. Morris, A. Narracott, H. Von Tengg-Kobligk, D. Alejandro, S. Soto, S. Hsiao, et al., “Computational fluid dynamics modelling in cardiovascular medicine”, (2015) 1–11. doi:10.1136/heartjnl-2015-308044.

Literature Survey in Biomedical CFD
Cardiovascular Systems
The adoption of CFD is potentially highly beneficial within cardiovascular medicine and clinical trials, improving diagnostic assessment and device design by predicting physiological responses to intervention and computing hemodynamic parameters that cannot otherwise be measured. Research on CFD applications in the cardiovascular system addresses the associated methodology, analytical assessment, and results for three main physiological components of heart function, namely the valves, the arteries, and the ventricle. Cardiovascular disease is a major cause of death around the world. Heart valve disease is a common condition caused by narrowing of the aortic valve or leakage of blood flow at the valve leaflets. In a recent study, [Basri et al.]57 investigated the hemodynamic effect of different valve openings (45°, 62.5°, and fully open) by using a combination of magnetic resonance imaging (MRI) and CFD simulation. The authors examined the hemodynamic properties in terms of pressure, velocity, and wall shear stress to determine the blood behavior of severe aortic stenosis. The results show a significant decrease of blood pressure for the small valve opening, which obstructs blood ejection due to the narrowing of the valve. Hence, the study found that the lower leaflet opening has a detrimental effect on blood flow and induces higher stress on the leaflets. In addition, [Basri et al.] compared the normal aortic valve (fully open) and a stenosed aortic valve (62.5° opening) through a study of the hemodynamic properties. The authors used CFD simulation on a 3D aortic valve model imported from MRI scan data. The study observed a 13.7% increase in velocity and a 2.9% reduction in the mass of blood entering the aortic branches for the stenosed aortic valve compared to the normal aortic valve, demonstrating a significant reduction of the blood supply to the head, neck, and arms. Meanwhile, [Tan et al.]58 carried out a patient-specific assessment of a stenosed aortic valve and compared the aortic flow pattern before and after deploying a transcatheter aortic valve. The authors carried out CFD simulations incorporating MRI scan data to investigate the flow patterns in the thoracic aorta in terms of velocity profile and wall shear stress. The resulting flow patterns show a 20% reduction of the jet flow in the instantaneous velocity streamlines and a lower time-averaged wall shear stress after implantation. Hence, the combination of imaging and simulation in this study allowed an individual evaluation of the disturbed blood flow patterns and wall shear stress on the aorta before and after the implantation procedure. [Jamuna & Abnurajan]59 measured the velocity and pressure of the blood flow through a patient-specific aorta in different conditions: a normal aorta, an aorta with plaque at the valve sinus side, and an aorta with a bi-leaflet valve implant. The authors used a patient-specific computed tomography (CT) image and analyzed it by using CFD simulation. It is observed that the blood flow pattern after implanting a valve is similar to that of the normal aorta, where the increases in velocity and blood pressure are shown
to be 58.5% and 81.8%, respectively. [Sirois et al.]60 also studied the implantation of an aortic valve in a patient-specific model by using CT images and CFD simulations. The authors performed a quantitative analysis of the hemodynamics in terms of blood flow patterns before and after the implantation procedure. A reduction of the pressure drop by 25.27 mmHg and an increase of the effective orifice area from 0.53 to 1.595 cm² were significant results following the valve implantation. The assessment of hemodynamic properties was carried out considering the wall shear stress, oscillatory shear index, and average wall shear stress gradient for 30 patients. [Gao et al.]61 studied stent implantation as an interventional procedure for the treatment of coronary artery disease. The authors compared the blood flow before and after stent implantation and analyzed the wall shear stress and blood velocity. In this study, the wall shear stress and blood velocity are greater at the region of stenosis prior to implanting the stent; the results show a reduction of the maximum flow rate in the coronary artery and an increase of the wall shear stress after the implantation procedure. [Chaichana et al.]62 studied the hemodynamic effects of simulated plaque in left coronary artery models of patient-specific coronary stenosis. Three parameters are measured, namely wall shear stress, pressure gradient, and flow velocity, by using CFD analysis and compared between the presence and absence of plaques in the left coronary models during the cardiac cycle. It is observed that the highest pressure gradients occur in the stenotic regions caused by the plaques and that lower-flow-velocity areas are found at post-plaque locations, while the wall shear stress is similar at the stenotic regions.

57 A.A. Basri, M. Zubair, A.F.A. Aziz, R.M. Ali, M. Tamagawa, K.A. Ahmad, “Computational Fluid Dynamics Study of the Aortic Valve Opening on Hemodynamics Characteristics”, 2014 IEEE Conf. Biomed. Eng. Sci., 8-10 December 2014, Miri, Sarawak, IEEE, 2014, pp. 99–102. doi:10.1109/IECBES.2014.7047660.
58 F.P.P. Tan, X.Y. Xu, R. Torii, N.B. Wood, N. Delahunty, M. Mullen, et al., “Comparison of Aortic Flow Patterns Before and After Transcatheter Aortic Valve Implantation”, Cardiovasc. Eng. Technol. 3 (2012) 123–135. doi:10.1007/s13239-011-0073-3.
59 J. C., M. Abnurajan, “Design of Patient Specific Prosthetic Aortic Valve and to Study its Computational Fluid Dynamics”, 3rd Int. Conf. Electron. Comput. Technol. 3 (2011) 355–360.
60 E. Sirois, Q. Wang, W. Sun, “Fluid Simulation of a Transcatheter Aortic Valve Deployment into a Patient-Specific Aortic Root”, Cardiovasc. Eng. Technol. 2 (2011) 186–195. doi:10.1007/s13239-011-0037-7.
61 F. Gao, G. Li, R. Hu, H. Okada, “Computational Fluid Dynamic Analysis of Coronary Artery Stenting”, Int. J. Biosci. Biochem. Bioinforma. 4 (2014) 155–159. doi:10.7763/IJBBB.2014.V4.330.
62 T. Chaichana, Z. Sun, J. Jewkes, “Computational fluid dynamics analysis of the effect of plaques in the left coronary artery”, Comput. Math. Methods Med. 2012 (2012) 504367. doi:10.1155/2012/504367.

Respiratory Systems
Despite the importance of the cardiovascular system for blood circulation and nutrient transport throughout the human body, the respiratory system also plays an essential role in human lung function, primarily nasal breathing. CFD-based analysis provides a better understanding of the airflow characteristics and fluid dynamics in the nasal cavity, yielding functional and anatomical data. Research on CFD applications for the respiratory system has received attention concerning basic airflow studies on the physiology of the nose, drug deposition, and virtual surgery for surgical intervention. Recent studies have combined computational analysis with imaging to obtain realistic numerical simulations of the respiratory system. [Segal et al.]63 studied the differences in respiratory flow patterns of four different human nasal cavities by using MRI scans and CFD simulations. The study was conducted by performing numerical simulations of steady-state inspiratory laminar airflow at a flow rate of 15 L/min and comparing the measurements in terms of streamline patterns, velocities, and helicity values. The authors observed that the majority of the flow passed through the middle and ventral regions of the nasal passages; however, the amount and location of swirling flow varied among subjects. [Wen et al.]64 simulated steady laminar nasal airflow at flow rates of 7.5 to 15 L/min to present the flow patterns in the left and right nasal cavities by using CFD simulation software (FLUENT®) and CT scan images of human nasal cavity models. The measured flow pattern features included high velocities in the constricted nasal valve region, vortex formation posterior to the nasal valve region, and high flow close to the septum walls. The results show that the nasal resistance within the first 2-3
cm contributes up to 50% of the total airway resistance, and vortices were found at the upper olfactory region and posterior to the nasal valve region. [Croce et al.]65 also simulated steady-state laminar airflow at a flow rate of 353 ml/s in the left and right nostrils using CFD simulation software (FLUENT®), based on CT scan images of a plastinated head processed with the commercial software package AMIRA (Mercury Computer System, Berlin). The authors described the flow patterns in a physiologically realistic bi-nasal model with attention to the pressure drop. The results showed that the major total pressure drop occurs in the nasal valve region and that the predominant airflow is in the inferior median part of the nasal cavities. Vortices are also observed downstream of the nasal valve and towards the olfactory region. Beyond basic airflow studies on the physiological function of the nose, drug deposition is of fundamental importance in the treatment of various lung diseases and allergies. Recent CFD studies of drug deposition have received great interest as a way to characterize local deposition patterns and optimize drug delivery in the respiratory system. [Bahmanzadeh et al.]66 studied the effect of endoscopic sphenoidotomy surgery on the flow patterns and deposition of micro-particles in the human nasal passage and sphenoid sinus. The authors presented transient pre- and post-surgery airflow patterns during a full breathing cycle under cyclic flow conditions. The transport and deposition of inhaled micro-particles are evaluated by using a Lagrangian approach to track the unsteady particles entering the nasal airway during the inhalation phase of the breathing cycle. The study found increased airflow due to the sphenoidotomy and increased deposition of micro-particles in the sphenoid region. In the post-operative case, 25 μm particles are observed to be able to penetrate into the sphenoid region, and the highest deposition for 10 μm particles, at about 1.5%, occurred during resting breathing. [Dastan et al.]67 studied the deposition of fibrous particles in different human nasal passages by using CFD simulations. The authors developed an in-house code to solve the combined translational and rotational equations of motion of ellipsoids for fiber transport and deposition in the nasal airways. The results show a significant effect of the variation of nasal airways on the deposition fraction; the deposition fraction is highly affected by the nasal geometry and the airflow rate in the nasal valve and main airway regions. Hence, it is shown that an aerodynamic diameter based on the Stokes equivalent diameter, employed in the impaction parameter, can collapse the simulation data of spherical and fibrous particles onto a single curve. [Abouali et al.]68 studied the airflow distribution and particle deposition in the nasal airway, maxillary, and frontal sinuses for virtual uncinectomy and middle meatal antrostomy. The study considered the inhalation of micro- and nano-particles to determine the penetration of airflow into the sinus cavity. For micro-particles, the paths and deposition of particles in the nasal passages and maxillary sinuses were evaluated using a Lagrangian trajectory analysis approach; for nano-particles, the transport and deposition analysis used a diffusion model. The rate of particle deposition in the maxillary and frontal sinuses was analyzed and compared between the pre- and post-surgery conditions. The results show that almost no particles entered the sinuses in the pre-operative condition. However, the inhaled nano- and micro-particles easily entered the sinuses due to the increase of
airflow penetration into the sinus cavity after surgery. Virtual surgery in relation to CFD simulation has also received great interest as a means to determine the best possible surgical treatment for a constricted airway69. In most studies, the virtual surgery consists of removing one or both of the obstructions in different proportions in order to enhance the nasal airway compared to its baseline condition. A recent study by [Moghadas et al.]70 examined the effect of septal deviation on the flow patterns and deposition of micro/nano-particles in realistic human nasal airways before and after septoplasty. The authors simulated the steady airflow through the nasal passage by using Eulerian and Lagrangian approaches for nano- and micro-particles. The simulations show that the flow field and particle deposition depend on the passage geometry. For micro-particles, the deposition rate with septal deviation is higher compared to the normal and post-operative passages, whereas the deposition of nano-particles shows similar trends for both the normal and post-operative passages. Hence, simulation provides a suitable tool for predicting the airflow and particle deposition patterns in the nasal passages that specific surgical interventions would produce. [Xiong et al.]71 compared nasal airflow after two different surgical interventions involving three facets, namely opening the paranasal sinuses, excising the ethmoid sinuses, and excising or preserving the uncinate process, in a cadaveric head model through CFD simulations. The study found that significantly large nasal cavity airflow velocity changes are apparent when the uncinate process is excised and that the nasal cavity airflow is similar when the uncinate process is preserved; the uncinate-process-excising procedure shows a greater increase in airflow volume compared with the uncinate-process-preserving procedure. Previously, [Xiong et al.]72 carried out a numerical simulation of nasal cavity airflow pre and post virtual functional endoscopic sinus surgery (FESS) with the aid of CFD simulations (FLUENT). The authors aimed to investigate and numerically visualize the airflow trace, distribution, velocity, air pressure, and airflow exchange between the nasal cavity and paranasal sinuses in a normal adult subject. The results show an increased airflow distribution in the maxillary, ethmoid, and sphenoid sinuses, and an increase of 13% through the area connecting the middle meatus and the surgically opened ethmoid. On the other hand, [Garcia et al.]73 used CFD simulations based on medical imaging software (MIMICS, Materialise) to study the airflow, water transport, and heat transfer in the nose of an atrophic rhinitis (AR) patient. The subject is a patient who received treatment with a nasal-cavity-narrowing procedure in which rib cartilage was implanted under the mucosa along the floor of the nose and a septal spur was removed. The reconstructed nose was simulated, and the nasal airflow was assumed to be laminar at 15 L/min, corresponding to a resting breathing rate. The simulation shows that the atrophic nose does not condition inspired air as effectively as the healthy geometries.

63 R.A. Segal, G.M. Kepler, J.S. Kimbell, “Effects of differences in nasal anatomy on airflow distribution: A comparison of four individuals at rest”, Ann. Biomed. Eng. 36 (2008) 1870–1882.
64 J. Wen, K. Inthavong, J. Tu, S. Wang, “Numerical simulations for detailed airflow dynamics in a human nasal cavity”, Respir. Physiol. Neurobiol. 161 (2008) 125–135. doi:10.1016/j.resp.2008.01.012.
65 C. Croce, R. Fodil, M. Durand, G. Sbirlea-Apiou, G. Caillibotte, J.-F. Papon, et al., “In Vitro Experiments and Numerical Simulations of Airflow in Realistic Nasal Airway Geometry”, Ann. Biomed. Eng. 34 (2006) 997–1007. doi:10.1007/s10439-006-9094-8.
66 H. Bahmanzadeh, O. Abouali, M. Faramarzi, G. Ahmadi, “Numerical simulation of airflow and micro-particle deposition in human nasal airway pre- and post-virtual sphenoidotomy surgery”, Comput. Biol. Med. 61 (2015) 8–18. doi:10.1016/j.compbiomed.2015.03.015.
67 A. Dastan, O. Abouali, G. Ahmadi, “CFD simulation of total and regional fiber deposition in human nasal cavities”, J. Aerosol Sci. 69 (2014) 132–149. doi:10.1016/j.jaerosci.2013.12.008.
68 O. Abouali, E. Keshavarzian, P. Farhadi Ghalati, A. Faramarzi, G. Ahmadi, M.H. Bagheri, “Micro and nanoparticle deposition in human nasal passage pre and post virtual maxillary sinus endoscopic surgery”, Respir. Physiol. Neurobiol. 181 (2012) 335–345. doi:10.1016/j.resp.2012.03.002.
69 G. Mylavarapu, “Computational Flow Modeling of Human Upper Airway Breathing”, University of Cincinnati, 2013. http://gradworks.umi.com/36/01/3601415.html.
70 H. Moghadas, O. Abouali, A. Faramarzi, G. Ahmadi, “Numerical investigation of septal deviation effect on deposition of nano/microparticles in human nasal passage”, Respir. Physiol. Neurobiol. 177 (2011) 9–18.
71 G.-X. Xiong, J.-M. Zhan, K.-J. Zuo, L.-W. Rong, J.-F. Li, G. Xu, “Use of computational fluid dynamics to study the influence of the uncinate process on nasal airflow”, J. Laryngol. Otol. 125 (2011).
72 G. Xiong, J.-M. Zhan, H.-Y. Jiang, J.-F. Li, L.-W. Rong, G. Xu, “Computational fluid dynamics simulation of airflow in the normal nasal cavity and paranasal sinuses”, Am. J. Rhinol. 22 (2008) 477–482.
73 G.J.M. Garcia, N. Bailie, A. Martins, J.S. Kimbell, et al., “Atrophic rhinitis: a CFD study of air conditioning in the nasal cavity”, J. Appl. Physiol. (2007) 1082–1092. doi:10.1152/japplphysiol.01118.2006.

Merits and Limitations of Biomedical Applications in CFD
The CFD model is applied as described before, but now includes clinical imaging (MRI, CT scan, etc.) in the preprocessing phase74. The validation can, of course, be done with the same tools in post-processing. The whole procedure is shown in Figure 4.2. CFD has received increasing interest
from mathematical curiosity to become an important technique for studying complex physiological flow patterns, demonstrating its potential especially in the cardiovascular and respiratory systems. To date, CFD has been adopted by medical researchers to help predict the characteristics of circulatory blood flow inside the human body and of airflow in human nasal breathing. Hence, it offers benefits such as lowering the chances of post-operative complications, facilitating the development of better surgical treatments, providing high efficiency with less destructive medical equipment, and conveying a good understanding of biological processes75. From a theoretical point of view, CFD provides benefits by concentrating on the construction and solution of the governing equations and the study of numerous approximations to these equations. Compared with experimental and analytical approaches, the merit of CFD is as an alternative, cost-effective means of simulating real fluid flow, particularly flows involving human body systems, and it provides detailed visual and comprehensive information. Despite these merits, there are also some limitations to applying CFD. The accuracy of CFD solutions is limited by the fidelity of the physical models of real-world processes, such as turbulence, multiphase flow, and compressibility, and by the quality of the input data. Thus, numerical results must be thoroughly analyzed and examined in order to make proper critical judgements about the computed results. Furthermore, numerical errors, such as round-off and truncation errors, invariably occur when solving the equations on a computer. The practicability of CFD also depends on several factors such as the specific materials and processes, accurate algorithms for the governing equations, powerful CFD packages, and high-speed, large-memory computers.

Figure 4.2 CFD Model Construction for Biomedical Application

74 P.D. Morris, A. Narracott, H. Von Tengg-Kobligk, D. Alejandro, S. Soto, S. Hsiao, et al., “Computational fluid dynamics modelling in cardiovascular medicine”, 2015, 1–11. doi:10.1136/heartjnl-2015-308044.
75 B. K. Lee, “Computational fluid dynamics in cardiovascular disease”, Korean Circ. J. 41 (2011) 423–430. doi:10.4070/kcj.2011.41.8.423.

Hemodynamic Flow Modeling
The equations describing incompressible flow may be written as

∇·𝐯 = 0 ,    ρ(∂𝐯/∂t + 𝐯·∇𝐯) = ∇·σ + 𝐟 ,    σ = −p𝐈 + 𝛕
Eq. 4.1

where v is the velocity vector, σ is the stress tensor, f is the external or body force (assumed zero here), and τ is the deviatoric stress tensor, which is a function of the shear rate tensor D:

𝛕 = μ(γ̇)𝐃 ,    𝐃 = ½(∇𝐯 + ∇𝐯ᵀ) ,    γ̇ = √(½ Σᵢ Σⱼ Dᵢⱼ Dⱼᵢ)
Eq. 4.2

where μ is the dynamic blood viscosity and γ̇ is the shear rate. Blood is a non-Newtonian fluid, implying that the viscosity μ depends on the strain rate tensor. The last two decades have seen impressive progress in our ability to solve these equations in an expedient manner. Key elements of any modern incompressible flow solver include an Arbitrary Lagrangian-Eulerian (ALE) formulation for moving walls (deforming grids) with implicit time stepping. To close the system of equations, constitutive equations are needed to calculate the viscosity of the blood. A variety of constitutive equations have been proposed to model blood flow. The simplest model is a Newtonian fluid, which assumes a constant viscosity (μ = μ0). Recent studies, however, suggest that shear-dependent viscosity models can accurately capture the shear-thinning nature of blood flow. The most common non-Newtonian model used for blood is the power law, which can be expressed in the following form:

μ = k(γ̇)ⁿ⁻¹
Eq. 4.3

where k is the flow consistency index and n is the power-law index, characterizing the non-Newtonian behavior of the blood76. This mathematical description is one of the simplest models used for representing the behavior of a non-Newtonian fluid. However, since the model viscosity grows without bound as the shear rate approaches zero, only values in a realistic shear-rate range can approximate non-Newtonian fluid behavior. The power-law index is usually chosen so that the model reproduces the shear-thinning behavior of the blood in hemodynamic simulations. Regardless, both k and n depend on the components of the blood, mainly the hematocrit, and are subject to change for each individual77. The following equation gives the dynamic viscosity with regard to this viscosity model78.

76 Hussain, M.A., Kar, S., Puniyani, R.R., “Relationship between power law coefficients and major blood constituents affecting the whole blood viscosity”, J. Biosci. 24(3), 329–337, 1999.
77 Cho, Y.I., Kensey, K.R., “Effects of the non-Newtonian viscosity of blood on flows in a diseased arterial vessel. Part 1: steady flows”, Biorheology 28, 241–262, 1991.
78 Hamidreza Gharahi, Byron A. Zambrano, David C. Zhu, J. Kevin DeMarco, Seungik Baek, “Computational fluid dynamic simulation of human carotid artery bifurcation based on anatomy and volumetric blood flow rate measured with magnetic resonance imaging”, Int. J. Advanced Engineering Science and Applied Math, DOI 10.1007/s12572-016-0161.
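As a concrete illustration of Eq. 4.2 and Eq. 4.3, the Python sketch below computes the scalar shear rate from a velocity-gradient tensor and evaluates the power-law viscosity. The coefficient values and the lower shear-rate cutoff are illustrative assumptions, not values taken from the text.

```python
import numpy as np

def shear_rate(grad_v):
    """Scalar shear rate from the velocity-gradient tensor (Eq. 4.2):
    D = 0.5*(grad_v + grad_v^T), gamma_dot = sqrt(0.5 * sum_ij D_ij D_ji)."""
    D = 0.5 * (grad_v + grad_v.T)
    return np.sqrt(0.5 * np.sum(D * D.T))

def power_law_viscosity(gamma_dot, k=0.017, n=0.708, gamma_min=1e-3):
    """Power-law viscosity mu = k * gamma_dot**(n-1) (Eq. 4.3).
    The cutoff gamma_min avoids the unbounded viscosity as gamma_dot -> 0;
    k and n are assumed, illustrative blood-like parameters."""
    return k * max(gamma_dot, gamma_min) ** (n - 1.0)

# Illustrative velocity gradient (units 1/s); in a solver this would come from
# the discrete solution at a cell or integration point.
grad_v = np.array([[0.0, 120.0, 0.0],
                   [0.0,   0.0, 0.0],
                   [0.0,   0.0, 0.0]])
gd = shear_rate(grad_v)
print(f"shear rate = {gd:.1f} 1/s, viscosity = {power_law_viscosity(gd):.5f} Pa.s")
```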

Boundary Conditions
The imposition of proper flow boundary conditions represents one of the most difficult, and
admittedly questionable, aspects of patient-specific simulations. In the first place, the flux data are not easy to obtain. Measuring velocity profiles via Phase-Contrast MRA (PC-MRA) requires non-standard imaging protocols and a longer scanning time. Then there is the question of resolution. The number of pixels required for accurate vessel geometry reconstruction is much lower than the number of pixels required for accurate flow profile reconstruction. Furthermore, a complete characterization of the velocity field would require even longer scanning times; for this reason only the velocity normal to a cut is measured, i.e., all cross-velocity information is lost. For some vessels, peak velocities can be measured using ultrasound techniques, and these can in turn be used to impose boundary conditions. On the other hand, we know that the flow in curved tubular structures can exhibit considerable cross flow, and that any form of cross flow can have significant effects downstream. To date, most CFD simulations have been carried out prescribing fully developed, time-dependent velocity profiles derived from flow-rate curves using the Womersley solution (a minimal sketch of such a profile is given at the end of this subsection). The Womersley solution holds only for pulsating flow in an infinitely long circular cylinder. For other vessel cross-sections the Womersley profiles are mapped accordingly. Pressure boundary conditions are important for fluid-structure interaction (FSI) simulations with compliant walls. Pressures can be obtained invasively using catheters, but it would be highly desirable to develop noninvasive pressure-measuring techniques. Major outstanding problems in this field are:

• The derivation of post-operative boundary conditions from pre-operative data, and
• The derivation of boundary conditions when complete information is unavailable.

We do not expect to be able to obtain complete flow and pressure data for this complex arterial system for years to come.
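As a minimal illustration of the Womersley solution mentioned above, the sketch below evaluates the axial velocity profile for a single pressure-gradient harmonic in a rigid circular tube, assuming SciPy is available. The vessel radius, frequency, and pressure-gradient amplitude are illustrative assumptions; a practical implementation would superpose several harmonics fitted to a measured flow-rate curve.

```python
import numpy as np
from scipy.special import jv  # Bessel function of the first kind (accepts complex arguments)

def womersley_velocity(r, R, omega, dpdx_amp, rho, mu, t):
    """Axial velocity u(r, t) for one harmonic of the pressure gradient
    dp/dx = Re{ dpdx_amp * exp(i*omega*t) } in a rigid circular tube of radius R."""
    alpha = R * np.sqrt(rho * omega / mu)            # Womersley number
    i32 = 1j ** 1.5
    profile = 1.0 - jv(0, i32 * alpha * r / R) / jv(0, i32 * alpha)
    u_hat = (1j * dpdx_amp / (rho * omega)) * profile
    return np.real(u_hat * np.exp(1j * omega * t))

# Illustrative values (SI units): 3 mm radius, 1 Hz heart rate, blood-like properties
R, rho, mu = 3.0e-3, 1060.0, 3.5e-3
omega = 2.0 * np.pi * 1.0
dpdx_amp = -1000.0                                   # Pa/m, assumed amplitude
r = np.linspace(0.0, R, 6)
print(np.round(womersley_velocity(r, R, omega, dpdx_amp, rho, mu, t=0.0), 4))
```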

Structural Deformation Models
Arterial wall movement will have a profound effect on local flow conditions. One observes that fluxes do not 'add up' if the deformation of the wall is neglected. In principle, the vessel wall and the surrounding medium can be modeled using a structural dynamics solver for 3D nonlinear, large-deformation behavior. However, the difficulties in obtaining proper initial and boundary conditions are even more pronounced here than for the flow solver. The material is highly nonlinear, orthotropic, layered, may be responding, etc. How to obtain this information non-invasively is, at this point, an open question. For this reason, most wall deformations have been computed using shells [Per, Zha] or, even simpler, an independent ring model [Qua]. In this case, the normal wall displacement η is obtained from:

m ηtt + d ηt + k η = p ,    m = ρw h ,    k = E h / [(1 − ν²) r²]
Eq. 4.4

where ρw, h, r, E, ν denote, respectively, the wall density, thickness, vessel radius, Young's modulus and Poisson ratio. This equation is integrated using a second-order implicit time integration scheme.
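A minimal sketch of one possible second-order implicit integration of Eq. 4.4 is given below, using the average-acceleration Newmark scheme. The damping coefficient, material values, and the pressure history are illustrative assumptions rather than values from the text.

```python
import numpy as np

def integrate_ring_model(m, d, k, p_of_t, dt, n_steps, eta0=0.0, v0=0.0):
    """Integrate m*eta'' + d*eta' + k*eta = p(t) with the (implicit, second-order)
    average-acceleration Newmark scheme (beta = 1/4, gamma = 1/2)."""
    beta, gamma = 0.25, 0.5
    eta, v = eta0, v0
    a = (p_of_t(0.0) - d * v - k * eta) / m          # consistent initial acceleration
    history = [eta]
    for step in range(1, n_steps + 1):
        t_new = step * dt
        # Effective stiffness and load for the displacement update
        k_eff = k + gamma * d / (beta * dt) + m / (beta * dt**2)
        rhs = (p_of_t(t_new)
               + m * (eta / (beta * dt**2) + v / (beta * dt) + (0.5 / beta - 1.0) * a)
               + d * (gamma * eta / (beta * dt) + (gamma / beta - 1.0) * v
                      + dt * (gamma / (2.0 * beta) - 1.0) * a))
        eta_new = rhs / k_eff
        a_new = (eta_new - eta) / (beta * dt**2) - v / (beta * dt) - (0.5 / beta - 1.0) * a
        v_new = v + dt * ((1.0 - gamma) * a + gamma * a_new)
        eta, v, a = eta_new, v_new, a_new
        history.append(eta)
    return np.array(history)

# Illustrative, assumed parameters: rho_w, wall thickness h, radius r, E, nu
rho_w, h, r, E, nu = 1100.0, 5.0e-4, 3.0e-3, 5.0e5, 0.5
m, k = rho_w * h, E * h / ((1.0 - nu**2) * r**2)
d = 50.0                                             # assumed damping coefficient
p = lambda t: 2000.0 * (1.0 + 0.5 * np.sin(2.0 * np.pi * t))   # assumed pressure load (Pa)
eta_hist = integrate_ring_model(m, d, k, p, dt=1.0e-3, n_steps=1000)
print(f"wall displacement at t = 1 s: {eta_hist[-1]*1e3:.3f} mm")
```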

Fluid-Structure Interaction Techniques
Given that vessel deformation plays an important role in local flux evaluations, the fluid and structure models must be combined79. Due to their generality, modularity, and extendibility, so-called loose coupling techniques have been used extensively in engineering. The key idea is to have a master
code that invokes the fluid and structural codes alternately in such a way that a minimum of changes is required for the latter. For implicit CFD and CSD codes, the following under-relaxed predictor-corrector scheme is used for each time step:

while not converged:
    update structure with fluid load:               x_S^i = (1 − α) x_S^(i−1) + α f(σ_F^i)
    update fluid with structure position/velocity:  σ_F^i = (1 − α) σ_F^(i−1) + α g(x_S^i)
end while
Eq. 4.5

Typical under-relaxation factors are in the range 0.5 ≤ α ≤ 0.9. Subscripts F and S denote the fluid and solid phases, respectively; x_S^i is the structure surface deformation due to the fluid forces f(σ_F^i), and g(x_S^i) is the change of the fluid forces produced by the structure deformation.

79 Rainald Lohner, Juan Cebral, Orlando Soto, Peter Yim, James E. Burgess, “CFD in Medicine and Life Sciences Applications on the Living Human Being”, George Mason University, Fairfax, VA 22030-4444, USA.
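The loose-coupling iteration of Eq. 4.5 can be sketched in a few lines of Python. The example below uses simple placeholder functions in place of the fluid and structure solvers, so it only illustrates the under-relaxed fixed-point structure of the scheme, not an actual coupled solver.

```python
import numpy as np

def coupled_step(fluid_load, structure_response, x_s, sigma_f,
                 alpha=0.7, tol=1e-8, max_iter=100):
    """Under-relaxed loose-coupling iteration for one time step (Eq. 4.5).
    fluid_load(x_s) returns the interface fluid stress for a given wall position;
    structure_response(sigma_f) returns the wall position for a given fluid stress."""
    for _ in range(max_iter):
        x_new = (1.0 - alpha) * x_s + alpha * structure_response(sigma_f)
        sigma_new = (1.0 - alpha) * sigma_f + alpha * fluid_load(x_new)
        converged = (np.linalg.norm(x_new - x_s) < tol and
                     np.linalg.norm(sigma_new - sigma_f) < tol)
        x_s, sigma_f = x_new, sigma_new
        if converged:
            break
    return x_s, sigma_f

# Placeholder "solvers": a linear wall response and a load that relaxes as the wall moves out
k_wall = 4.0
structure_response = lambda sigma_f: sigma_f / k_wall
fluid_load = lambda x_s: 10.0 - 2.0 * x_s

x, sigma = coupled_step(fluid_load, structure_response,
                        x_s=np.array([0.0]), sigma_f=np.array([10.0]))
print(f"converged wall position = {x[0]:.4f}, interface load = {sigma[0]:.4f}")
```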

Future of CFD in Biomedical Engineering
Rapid development in the face of major outstanding computational modelling and technological challenges has driven the evolution of CFD, and this has been recognized by regulatory authorities. Creative models and novel applications for simulating complex fluid-mechanics problems related to the human anatomy of the cardiovascular and respiratory systems are now being progressively enabled by CFD simulation programs. It is therefore important to demonstrate the effectiveness of simulation results relative to invasive measurements through observational trials, particularly in multicenter clinical studies. Clearly, these methods have a high potential to change clinical practice in ways that benefit patients, health providers, and clinicians. The ability to predict flows in the vascular and pulmonary systems accurately on a patient-specific basis has increased dramatically in recent years. We expect progress to continue in all the areas that encompass a comprehensive simulation capability: image segmentation, grid generation, flow solvers, fluid-structure interaction (FSI), data reduction, and visualization. Some of the outstanding questions involve boundary conditions, material parameters (in particular for wall compliance), and the clinical significance of particular flow phenomena. At present, image-based, patient-specific computational hemodynamics can be used to:

• Study vascular diseases;
• Enhance diagnosis; and
• Plan surgical procedures.

Imaging modalities will continue to evolve, and eventually both anatomy and physiology will be accurately visualized. However, the power of computer simulations lies in their ability to predict the outcome of procedures, i.e., the answer to 'what if' questions that can be useful for optimizing therapies80. Looking into the more distant future, we predict:

• CFD-enhanced radiology,
• Simulations of long-term effects, such as plaque formation,
• Simulations of drug delivery and effects, and
• The coupling of flow codes (continuum level) and particle codes (molecular level).

80 Rainald Lohner, Juan Cebral, Orlando Soto, Peter Yim, James E. Burgess, “CFD in Medicine and Life Sciences Applications on the Living Human Being”, George Mason University, Fairfax, VA 22030-4444, USA.

Case Study 1 – CFD Simulation of Human Carotid Artery Bifurcation Based on Anatomy and Volumetric Blood Flow Rate Measured with MRI
Hemodynamic and geometric variables play a crucial role in the appearance and progression of various vascular diseases. Specifically, the significance of wall shear stress (WSS) and flow disturbances in the formation and rupture of an atherosclerotic plaque, which is a leading cause of stroke, is well acknowledged. Research shows that there is an increased chance of atheroma buildup around vessel bifurcations, where the blood flow is stagnant or highly disturbed. Additionally, WSS is proposed to be a controlling factor in the mechanism of plaque formation and rupture. These findings have persuaded researchers to develop techniques that enable them to estimate WSS. Through phase-contrast (PC) magnetic resonance imaging (MRI) velocity measurements, qualitative and quantitative assessment of WSS is possible. Essentially, PC-MRI measures the blood and tissue velocity at each point in the field of view. However, the relatively low in-plane spatial resolution of the image and the difficulty of detecting the circumferential wall obstruct an accurate estimation of WSS. Another approach is to investigate WSS by employing computational fluid dynamics (CFD) simulation of the blood flow in patient-specific models. Essential components of such simulations include accurate anatomic models, imposition of realistic boundary conditions, utilization of an appropriate viscosity model, and inclusion of the wall elasticity. Although accurate segmentation of the blood vessels is a crucial step in the analysis of blood circulation inside the body, realistic boundary conditions are suggested to be just as important for accurately estimating the flow with three-dimensional CFD simulations.
Approaches
Several approaches have been proposed to impose physiologically realistic boundary conditions. A common boundary condition type is the resistance boundary condition. This boundary condition does not require any specification of the flow rate or pressure at the outlet. However, the resultant flow and pressure waves are forced to be in the same phase in the resistance boundary condition, which violates the wave propagation phenomena. An alternative method is to use a 1D model to solve the periodic blood flow in the downstream vessels to provide boundary conditions for the 3D computational domain81,82. Solving for a large number of downstream vessels, however, requires some simplifications, which restricts the method to periodic blood flow. Therefore, a 0D model is proposed to track the dynamic nature of time-dependent flow in human arteries. The 0D modeling approach utilizes the concept of a hydraulic-electrical analogue, which is known as the [Windkessel model]83. By prescribing the impedance of the downstream vessels at the outlets, the Windkessel model facilitates the imposition of realistic boundary conditions for 3D simulations of blood flow. While most hemodynamic simulations employ the Newtonian model for arterial flow, during the past decade several studies have suggested that appropriate nonlinear viscosity models should be taken into account as key factors in hemodynamic simulations. Such shear-rate-dependent viscosity models have been proposed in the literature, most commonly the power-law, Carreau-Yasuda, and Casson models. Regardless, the viscosity of the blood depends on several factors such as the hematocrit level.

81 Formaggia, L., Gerbeau, J.F., Nobile, F., Quarteroni, A., “On the coupling of 3D and 1D Navier-Stokes equations for flow problems in compliant vessels”, Comput. Methods Appl. Mech. Eng. 191(6–7), 561–582 (2001).
82 Laganà, K., Dubini, G., Migliavacca, F., Pietrabissa, R., Pennati, G., Veneziani, A., Quarteroni, A., “Multiscale modelling as a tool to prescribe realistic boundary conditions for the study of surgical procedures”, Biorheology 39(3–4), 359–364 (2002).
83 Shi, Y., Lawford, P., Hose, R., “Review of zero-D and 1-D models of blood flow in the cardiovascular system”, Biomed. Eng. OnLine 10, 33–71 (2011).
In contrast, other studies have suggested that the nonlinear effect is negligible in large arteries such as the carotid arteries. Therefore, this study aims to investigate the nonlinear effect on
hemodynamic factors and to further improve the procedure to quantify uncertainties using PC-MRI measurements. Although several studies have shown that these simplifications are acceptable for hemodynamic simulation84, there is still a need to perform simulations that are as realistic as possible. To model a hemodynamic simulation in large arteries completely realistically, the deformability of the arterial walls should also be considered. However, for the sake of simplicity, rigid walls are assumed here.
Results and Discussion
CFD analysis was performed using the computed waveforms as boundary conditions for a 1-million-element mesh with a time step of 0.005 s over four cardiac cycles. The heart rate was assumed to be 60 bpm for all the simulations. The simulations were performed for different viscosity models, and the axial velocity contours on axial slices over one cardiac cycle are represented in Figure 4.3 (A-B). On the slice right below the bifurcation, negative axial velocity is present, which implies that the blood is recirculating around this area. Moreover, the velocity profiles do not vary drastically from model to model, as can be observed in Figure 4.3 (A-B). However, near the recirculation zone, where the velocity magnitude is lower, slightly different contour lines can be noticed. The distribution of the time-averaged wall shear stress (TAWSS) on the carotid artery wall is demonstrated in Figure 4.3 (C-D). The difference between the viscosity models can be illustrated through the WSS contours. For instance, near the carotid sinus, where the carotid artery expands, dissimilar contour lines are observable. In addition, the lowest TAWSS is seen in this region. This is especially important because the carotid sinus is the location where plaque accumulates. As expected, the highest WSS occurs near the junction points due to high velocity gradients. Similar to what was observed in other studies, the results show that the blood flow is predominantly unidirectional during the cardiac cycle for the healthy patient, and therefore the Oscillatory Shear Index (OSI) is almost zero everywhere on the whole circumference of the domain. Although the power-law and Newtonian models are very similar to each other, the discrepancy in the TAWSS is more noticeable in the patient carotid artery model than in that of the healthy subject, particularly around the plaque region. Near the outlet region of the ECA, the Newtonian model shows higher values.

Figure 4.3 Axial velocity (A: Newtonian, B: Power Law) and time-averaged WSS (C: Newtonian, D: Power Law)

84 Steinman, D.A., “Assumptions in modelling of large artery hemodynamics”, in: Ambrosi, D., Quarteroni, A., Rozza, G. (eds), Modeling of Physiological Flows, pp. 1–18, Springer, Milan (2012).

Figure 4.4 depicts the anatomical model and the velocity
contours for the patient model. It indicated an anatomic model for the patient with carotid artery plaque (Left) and axial velocity at peak systole (Right). Since the geometry is obviously more twisting, the blood flow is more unpredictable in this case. Starting from the inlet, the velocity profile is almost parabolic at the beginning. As the flow approaches the plaque and bifurcation region, it follows the geometry of the unhealthy carotid artery. Recirculation regions appear near the carotid sinus throughout the cardiac cycle. In addition, low velocity flow occurs near the carotid bifurcation and carotid sinus. However, the flow seems to be fully developed superior to the ICA.

Figure 4.4 Anatomic Model for the Patient with Carotid Artery Plaque (Left) and Axial Velocity at Peak Systole (Right)

Case Study 2 - CFD Analysis of the Effect of Plaques in the Left Coronary Artery

Coronary Artery Disease (CAD) is the leading cause of death in advanced countries. The most common cause of CAD is atherosclerosis, which results from the presence of plaques on the artery wall and leads to lumen stenosis85. Plaques have been particularly associated with blood clots and compromise blood flow to the myocardium. This occurs when coronary plaques suddenly rupture; if a clot cannot be treated in time, the heart muscle will be impaired due to ischemic changes, leading to myocardial ischemia, infarction or, more severely, necrosis. Therefore, early detection and diagnosis of CAD is particularly important for reducing mortality and subsequent complications86. The natural history of coronary plaque depends not only on the formation and progression of atherosclerosis, but also on the vascular remodeling response. If the local wall shear stress is low, a proliferative plaque will form. The local inflammatory response will stimulate the formation of so-called "vulnerable plaque", which is prone to rupture with superimposed thrombus formation. The vast majority of these inflamed high-risk plaques cannot be detected by anatomic and myocardial perfusion imaging. Since the progression and development of vulnerable plaque is associated with low wall shear stress and the presence of expansive remodeling, measurement of these characteristics in vivo will enable risk stratification for the entire coronary circulation87. Figure 4.5 shows a 3D CT visualization of a normal left coronary artery with side branches in a patient with suspected coronary artery disease. The wall shear stress (WSS), wall pressure, and blood flow changes in the human body cannot be measured directly on blood vessels, whereas computational fluid dynamics (CFD) provides alternative ways to diagnose CAD88. The WSS in the coronary artery is known to play a significant role in the early formation of CAD. In addition, the WSS at the local vessel wall can indicate a predisposition for atherosclerosis development in various anatomical sections, thus enabling the prediction of coronary disease. CFD allows efficient and accurate computation of hemodynamic features of both normal and abnormal situations in the cardiovascular system, and in vivo simulation of coronary artery flow changes. CFD differs from medical imaging visualization: imaging techniques such as coronary angiography or computed tomography angiography show anatomic alterations of the coronary artery wall due to the presence of plaques, and therefore allow only assessment of the degree of lumen changes such as stenosis or occlusion. In contrast, CFD analysis enables the identification of hemodynamic changes in the coronary artery, even before the plaques are actually formed at the artery wall or can occlude the vessels. Therefore, to some extent, CFD allows early detection of coronary artery disease and improves the understanding of the progression of plaques, which is of paramount importance to clinical treatment. The purpose of this study was to investigate the hemodynamic effect of plaques in the left coronary artery using CFD analysis. Simulated plaques were inserted into the left main stem and left anterior descending coronary arteries (taken from a selected patient's data), and hemodynamic analysis was performed to correlate the presence of plaques with subsequent flow changes to the coronary main and side branches.

Figure 4.5 3D CT Visualization of a Normal Left Coronary Artery with Coronary Artery Disease

85 Thanapong Chaichana, Zhonghua Sun, and James Jewkes, "Computational Fluid Dynamics Analysis of the Effect of Plaques in the Left Coronary Artery", Hindawi Publishing Corporation, Computational and Mathematical Methods in Medicine, Volume 2012, Article ID 504367, doi:10.1155/2012/504367.
86 Australian Institute of Health and Welfare, "The tenth biennial health report of the Australian Institute of Health and Welfare," AIHW, Canberra, Australia, 2006.
87 F. J. Rybicki, S. Melchionna, D. Mitsouras et al., "Prediction of coronary artery plaque progression and potential rupture from 320-detector row prospectively ECG-gated single heart beat CT angiography: lattice Boltzmann evaluation of endothelial shear stress," International Journal of Cardiovascular Imaging, vol. 25, no. 2, pp. 289–299, 2009.
88 S. K. Shanmugavelayudam, D. A. Rubenstein, and W. Yin, "Effect of geometrical assumptions on numerical modeling of coronary blood flow under normal and disease conditions," Journal of Biomechanical Engineering, vol. 132, no. 6, article 061004, 2010.

Patient Data Selection for Generation of Left Coronary Artery Model
A sample patient suspected of CAD who underwent multi-slice CT angiography was selected, and the patient's volume CT data was used to generate a 3D coronary model. The original CT data was saved in digital imaging and communication in medicine (DICOM) format and then transferred to a workstation equipped with Analyze 7.0 (AnalyzeDirect, Inc., Lenexa, KS, USA) for image post-processing and segmentation. The three-dimensional (3D) volume data was post-processed and segmented using a semiautomatic method with a CT number thresholding technique89-90, and manual editing was performed in some slices to remove soft tissues and artefacts. The segmented model was produced with a special focus on the Left Coronary Artery (LCA) and its branches. The 3D LCA model was saved in STL format for further reconstruction purposes. Figure 4.5 shows the anatomical details of the left coronary artery91.

89 Z. Sun, R. J. Winder, B. E. Kelly, P. K. Ellis, and D. G. Hirst, "CT virtual intravascular endoscopy of abdominal aortic aneurysms treated with suprarenal endovascular stent grafting," Abdominal Imaging, vol. 28, no. 4, pp. 580–587, 2003.
90 Z. Sun, R. J. Winder, B. E. Kelly, P. K. Ellis, P. T. Kennedy, and D. G. Hirst, "Diagnostic Value of CT Virtual Intravascular Endoscopy in Aortic Stent-Grafting," Journal of Endovascular Therapy, 2004.
91 See 85.

Realistic Plaques Modelling
The actual plaques and degree of lumen stenosis on the coronary artery wall were simulated at the Left Main Stem (LMS) and the Left Anterior Descending (LAD) artery, as these branches are the common locations where plaques tend to form and induce myocardial ischemic changes. The plaques produced a lumen narrowing of approximately 60% diameter at the LMS and LAD, since more than 50% lumen stenosis leads to significant hemodynamic changes to flow within the coronary artery. Figure 4.6 shows the segmented LCA model with various views of the position of the plaques in the left coronary artery. Double arrows indicate that the rectangle is an effective plaque location (EPL).

Figure 4.6 Plaque Distribution in the Left Coronary Artery Model

Generation of Computational Models
The surfaces of the LCA models with and without plaques were prepared using Blender version 2.48 (Blender Institute, Amsterdam, Netherlands). A gentle B-spline smoothing technique was applied between the left main trunk and the side branches to reduce any potential nonphysical behavior induced by sharp edges. The surface models of the plaque-containing and normal coronary arteries were converted into solid models and saved in STL format for the subsequent creation of meshing elements. Both models were used to create hexahedral and tetrahedral meshes for the CFD simulations. The hexahedral mesh for the LCA model without plaques had 949,289 elements and 1,062,280 nodes, while the hexahedral mesh for the LCA model with plaques had 928,311 elements and 1,041,936 nodes. The tetrahedral mesh had 15,519 nodes and

78,618 elements. The meshes were generated using ANSYS ICEM CFD®, with details described in previous studies92.

Application of Physiological Parameters
In order to ensure that the analysis reflects a realistic simulation of in vivo conditions, physiological boundary conditions were applied for the 3D numerical analysis. The transient simulation was performed using accurate hemodynamic rheological and material properties, as described in a previous study93. A pulsatile velocity was applied as the inlet boundary condition at the left main stem, and a zero pressure gradient was applied at the left anterior descending and left circumflex outlet boundaries94. Appropriate rheological parameters were applied, with a blood density of 1060 kg/m3 and a blood viscosity of 0.0035 Pa·s. The blood flow was assumed to be laminar, and a no-slip condition was applied at the walls. Plaque was assumed to be a rigid body, and blood was assumed to be a Newtonian, incompressible fluid. In addition, the WSS predicted by Newtonian and non-Newtonian models was compared, especially at the stenotic locations. The non-Newtonian blood model was simulated using the generalized power law,

μ  λ(y) y   λ(y)  μ   Δμ exp  1      n(y)  n   Δn exp  1   

  b  y   exp   y  a   

Eq. 4.6

  d  y   exp      c  y 

where μ∞ = 0.035, n∞ = 1.0, Δμ = 0.25, Δn = 0.45, a = 50, b = 3, c = 50, and d = 4. The generalized power law model is constructed to fit experimental stress–strain measurements over the range of strain rates 0.1 < γ̇ < 1000 s−1.

Performance of Computational Hemodynamic Analysis
The Navier-Stokes equations were solved using the ANSYS CFX CFD package on a Microsoft Windows 32-bit machine with 6 MB RAM and an Xeon W3505 2.53 GHz CPU. The CFD simulation was run for 80 time steps, representing 1.0 second of pulsatile flow (0.0125 seconds per time step), with each time step converged to a residual target of less than 1×10−4 in approximately 100 iterations. The CFD solution was fully converged after approximately 8,000 iterations per LCA model, and the calculation time for each LCA model was approximately 2 hours. Flow velocity, cross-sections of the velocity pattern, and the pressure gradient were calculated and visualized using ANSYS CFD. Figure 4.7 represents the area of interest at the left coronary bifurcation and shows the measurement positions of the cross-sections of the models with and without plaques. The sectional planes were separated into three groups: Sections A–E, Sections F–J, and Sections K–O. The distance between sections in each group was approximately 0.5 millimeters.

92 T. Chaichana, Z. Sun, and J. Jewkes, "Computation of hemodynamics in the left coronary artery with variable angulations," Journal of Biomechanics, vol. 44, no. 10, pp. 1869–1878, 2011.
93 T. Frauenfelder, M. Lotfey, T. Boehm, and S. Wildermuth, "Computational fluid dynamics: hemodynamic changes in abdominal aortic aneurysm after stent-graft implantation," Cardiovascular and Interventional Radiology, vol. 29, no. 4, pp. 613–623, 2006.
94 E. Wellnhofer, J. Osman, U. Kertzscher, K. Affeld, E. Fleck, and L. Goubergrits, "Flow simulation studies in coronary arteries—Impact of side-branches," Atherosclerosis, vol. 213, no. 2, pp. 475–481, 2010.

The parameter used to characterize the impact of plaques at the coronary bifurcation


on hemodynamic flow was calculated as the magnitude of the local pressure spatial gradient (PSG), which is defined as

$$\mathrm{PSG} = \sqrt{\left(\frac{\partial p}{\partial x}\right)^{2} + \left(\frac{\partial p}{\partial y}\right)^{2} + \left(\frac{\partial p}{\partial z}\right)^{2}} \qquad \text{Eq. 4.7}$$

where p is the pressure in the area of interest. The local PSG is calculated from the spatial derivatives of the local pressure, and its value oscillates in relation to the percentage of plaque in the coronary lumen.

CFD Results of the Left Coronary Artery
CFD analysis of the realistic left coronary artery models with and without plaques was successfully performed under in vivo physiological conditions during the systolic and diastolic phases. Peak systolic velocity and pressure were reached at 0.4 s, and the diastolic phase was reached at 0.7 s of the cardiac cycle. The analysis demonstrates a strong relationship between hemodynamic change and plaques in the left coronary artery.

Figure 4.7 The EPL Posterior View at the Left Coronary Artery

4.9.6.1 Cutting Plane Visualization
Flow velocity was visualized inside the LMS at Sections A–E, as shown in Figure 4.7. Flow patterns in the pre- and post-stenotic cases were similar in Sections A and B (velocity ranged from 0 to 17.43 mm/s). However, the flow velocity increased in Sections C–E (velocity ranged from 23.96 to 30.50 mm/s), at the location of the plaques, during the systolic peak. In addition, the flow pattern was affected by the presence of plaques from Sections A–E, as observed in the post-stenotic region during the diastolic phase, with velocity increasing from 28.32 to 30.50 mm/s. For cutting views of Sections F–J and K–O, please refer to95.

95 See 85.
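To make the two quantities defined above concrete — the generalized power law viscosity of Eq. 4.6 and the pressure spatial gradient magnitude of Eq. 4.7 — the following Python sketch evaluates both on synthetic NumPy arrays. It is a minimal illustration under stated assumptions, not part of the cited study's workflow: the function names, the synthetic pressure field, and the grid spacings are hypothetical, while the model constants follow the list given after Eq. 4.6.

```python
import numpy as np

def generalized_power_law_viscosity(gamma_dot,
                                    mu_inf=0.035, n_inf=1.0,
                                    d_mu=0.25, d_n=0.45,
                                    a=50.0, b=3.0, c=50.0, d=4.0):
    """Apparent viscosity mu(gamma_dot) of Eq. 4.6 (generalized power law).

    gamma_dot : strain rate magnitude(s); parameter defaults follow the text.
    """
    g = np.abs(np.asarray(gamma_dot, dtype=float)) + 1e-12  # avoid divide-by-zero
    lam = mu_inf + d_mu * np.exp(-(1.0 + g / a) * np.exp(-b / g))
    n = n_inf - d_n * np.exp(-(1.0 + g / c) * np.exp(-d / g))
    return lam * g ** (n - 1.0)

def pressure_spatial_gradient(p, dx, dy, dz):
    """PSG of Eq. 4.7: magnitude of the pressure gradient on a uniform grid."""
    dpdx, dpdy, dpdz = np.gradient(p, dx, dy, dz)
    return np.sqrt(dpdx**2 + dpdy**2 + dpdz**2)

if __name__ == "__main__":
    # Viscosity over the strain-rate range quoted in the text (0.1 to 1000 1/s).
    gamma = np.logspace(-1, 3, 5)
    print("mu(gamma_dot) =", generalized_power_law_viscosity(gamma))

    # PSG on a small, hypothetical pressure field (units follow the input data).
    p = np.random.rand(8, 8, 8) * 100.0
    print("max PSG =", pressure_spatial_gradient(p, 1e-3, 1e-3, 1e-3).max())
```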


Figure 4.8 Flow Velocity Observed in Pre- and Post-Plaque Simulated Models: (a) post-plaque at t = 0.4 s, (b) pre-plaque at t = 0.4 s, (c) post-plaque at t = 0.7 s, (d) pre-plaque at t = 0.7 s

4.9.6.2 Wall Shear Stress (WSS) Comparisons
The analysis of WSS was focused in particular on the stenotic locations, with a comparison of non-Newtonian and Newtonian fluid models. Figure 4.10 compares the WSS for the different fluid viscosity models in the left coronary model with plaques present: non-Newtonian (a, c) and Newtonian (b, d). WSS contour values ranged from 0 Pa to 3.50 Pa for both fluid viscosity models. The WSS differed due to the presence of plaques at the LMS branch at the peak systolic phase, ranging from 0.50 Pa to 1.75 Pa with the non-Newtonian model and from 0.50 Pa to 1.0 Pa with the Newtonian model (Figure 4.10 (b)). Similar WSS values, ranging from 1.50 Pa to 3.50 Pa with both viscosity models (Figure 4.10 (c-d)), were found at the diastolic phase at the plaque positions in the LMS branch. WSS changes at the stenotic locations in the LAD were compared at the peak systolic phase, ranging from 0.50 Pa to 1.0 Pa with the non-Newtonian model (Figure 4.10 (a)) and from 0.50 Pa to 0.75 Pa with the Newtonian model (Figure 4.10 (b)). WSS values at the plaque positions in the LAD were compared at the diastolic phase, ranging from 1.50 Pa to 3.50 Pa with the non-Newtonian model (Figure 4.10 (c)) and from 1.50 Pa to 3.25 Pa with the Newtonian model (Figure 4.10 (d)).


Discussion
This study shows that coronary plaques produce a significant impact on the subsequent flow changes in the coronary artery, in addition to the local hemodynamic interference due to the presence of the plaques themselves. This is clinically important, as further potential effects could result from the plaques' interference, leading to adverse effects on the coronary artery such as lumen stenosis or worsening of atherosclerosis. It is well known that plaques most commonly form at coronary bifurcations and angulations, and this is an important factor that has been found to be related to the development of atherosclerosis, as confirmed by this and other studies. Multi-slice CT angiography and intravascular ultrasound have been widely used to detect and characterize plaques in the coronary arteries. Despite the promising results achieved with these imaging modalities, their limitations are restricted to image visualization and identification of coronary lumen changes due to the presence of plaques, and no information is available about the interference of plaques with the blood flow. In contrast, CFD overcomes those limitations by enabling the analysis of coronary blood flow and rheological factors.

Figure 4.9 Cross-Sectional Views of Sections A–E at the Left Main Stem

This study investigated two important factors, PSG and flow velocity, and quantified the impact of plaques on flow changes in the coronary arteries. The static wall pressure does not reflect the velocity profile from the flow axis to the vessel wall. In the clinical situation, the PSG magnitude has been used to judge the risk and severity of plaques; the highest PSG area may be relevant to potential coronary plaque rupture. In this study, the CFD analysis of the LCA with plaques present showed that the highest PSG occurred at the locations in both the LMS and LAD where plaques were simulated (see Figure 4.9), with measured PSG values ranging from 743.21 to 800 kg/m2s2. The presence of plaques in the coronary artery is responsible for obstructing blood flow to the myocardium, consequently affecting the flow velocity. Moreover, plaques influencing hemodynamic change may lead to the further distribution of plaques. Since velocity is the main component of local WSS and acts in the same direction as local WSS — meaning that flow velocity is low where the WSS is low, as observed in a previous study — the analysis in this study has explicitly described the hemodynamic changes inside the LCA surrounding the plaque locations, the so-called Effective Plaque Location (EPL). In Sections A–E, the flow velocity fluctuated in the post-stenotic regions during the cardiac cycles, and this could lead to abnormalities at the coronary wall responsible for atherosclerosis. In Sections I–L, flow recirculation occurred, and a region of low velocity was observed within a short distance from the plaques. Consequently, plaques could generate an effect that spreads into an area of low flow velocity, as demonstrated in Sections I–L, with measured low velocities of up to 2.18 mm/s. This is confirmed by our previous analysis showing that progression of plaques develops in low flow regions. This analysis provides insight into the effect of plaques on subsequent coronary flow changes, although further studies are needed to verify these preliminary findings. The WSS


in the non-Newtonian model was found to be similar to that observed in the Newtonian model at the plaque locations, although more detail was resolved by the non-Newtonian model, as shown in Figure 4.10. The effect of plaques in the left coronary artery is clearly shown by the Newtonian model, and this is adequate for analysis of the plaque effect. The comparison of WSS between the different viscosity models is confirmed by previous studies. The non-Newtonian model was simulated using the generalized power law, as it has been reported to produce WSS effects on coronary flow similar to those of the Newtonian model96.

Figure 4.10 Comparison of WSS between Non-Newtonian (a, c) and Newtonian (b, d) Models Observed in the Coronary Artery with Presence of Plaques, at t = 0.4 s (a, b) and t = 0.7 s (c, d)

96 See 85.


Limitation
There are some limitations in our study that should be addressed. Firstly, the realistic left coronary models, both pre- and post-stenotic, were assumed to have rigid rather than elastic walls; therefore, the simulation does not fully reflect the realistic physiological situation, as the coronary wall moves during the cardiac cycle. Secondly, the assumption of a Newtonian blood model becomes important especially in low-flow and low wall shear stress regions; nevertheless, a previous study has shown that the assumption of a Newtonian model is reasonable in this configuration. Thirdly, the realistic plaque positions may be affected by left coronary side branches, which have not been evaluated in this study. Thus, future studies will use coronary models with more realistic geometry, extended to evaluate the effect of side branches. In conclusion, we studied the effect of simulated plaques in a realistic left coronary artery on hemodynamic changes at the locations of the plaques, as well as in the pre- and post-stenotic regions inside the coronary artery. There is a direct effect of plaques in the left coronary artery on hemodynamic changes such as recirculating flow, low flow velocity regions, wall shear stress, and wall pressure gradient, indicating the potential for plaques to rupture and for atherosclerosis to worsen. Further studies focusing on realistic plaque effects on coronary side branches should be performed to verify our results97.

97 See previous.


5 Mesh Free Methods for CFD
While algorithms have seen great advances in CFD, mesh generation methods have lagged behind, creating a computational bottleneck. For industry and government looking to impact current and future products with simulation technology, mesh generation imposes great challenges. Many generation procedures lack automation, requiring many man-hours, which are becoming far more expensive than computer hardware. More automated methods are less reliable for complex geometry with sharp corners, concavity, or otherwise complex features. Most mesh generation methods to date require a great deal of user expertise to achieve proper stretching, resolution, and structure98. The motivation behind meshless methods lies in relieving the burden of mesh generation. Since the application of computational methods to real world problems appears to be paced by mesh generation, alleviating this bottleneck potentially impacts an enormous field of problems. It is not clear at this point how effective meshless methods will be at alleviating meshing problems. While a rigid mesh is not required, sufficiently dense point distributions are still required. Moreover, points must be grouped locally to form clouds, and obtaining optimal clouds for different methods is also a nontrivial problem. However, recent progress in the area of point distribution and cloud generation by Löhner and others99-100 has shown great promise in this area. Several of the most notable meshless methods are:

• Smooth Particle Hydrodynamics (SPH)
• Mesh free Local Petrov-Galerkin (MLPG)
• Methods based on Radial Basis Functions (RBF)
• Finite Point Methods (FPM)
• Mesh free Boundary schemes
• Reproducing Kernel Particle Method (RKPM)

These methods are also summarized in works by Liu101 and Liu and Gu102.

Smooth Particle Hydrodynamics (SPH)

The method of SPH, introduced by Monaghan103, makes use of an integral representation of a function at a point given a set of surrounding points, called a kernel approximation. It uses no mesh, and points are free to move past one another, consistent with a Lagrangian approach. While SPH was first developed to handle astrophysical phenomena in open space, the method was later applied to structures, fracture simulation, fluid flow, and other fields. Monaghan104 showed that the SPH method with artificial viscosity could accurately capture shock waves in one-dimensional shock tube problems. Methods based on an SPH formulation are well suited for problems of infinite domain in which the problem size is not known in advance.

98 Aaron Jon Katz, "Meshless Methods for Computational Fluid Dynamics", PhD dissertation, Department of Aeronautics and Astronautics, Stanford University, January 2009.
99 R. Löhner and E. Oñate. "An advancing front point generation technique". Communications in Numerical Methods in Engineering, 14:1097–1108, 1998.
100 R. Löhner, C. Sacco, and E. Oñate. "A general advancing front technique for filling space with arbitrary objects". Int. J. Numerical Meth. Engineering, 61:1977–1991, 2004.
101 G. R. Liu. Mesh Free Methods: Moving Beyond the Finite Element Method. CRC Press, 2003.
102 G. R. Liu and Y. T. Gu. An Introduction to Mesh Free Methods and Their Programming. Springer, 2005.
103 J. J. Monaghan and R. A. Gingold. "Shock simulation by the particle method SPH", Journal of Computational Physics, 52:374–389, 1983.
104 See previous.


While SPH has become popular for intensely dynamic problems in which a static or even dynamic mesh may not properly resolve the relevant physics, certain implementation difficulties are inherent in the method. These difficulties include the selection of a proper domain of influence with weighting functions, efficient nearest neighbor particle searching, and the determination of a smoothing length for force computations at each particle. The Reproducing Kernel Particle Method (RKPM), introduced by Liu, Jun and Zhang105, is very similar to the SPH method in that it uses a finite integral representation to discretize the governing PDEs. However, RKPM adds a correction function to the base kernel approximation, improving the accuracy especially near boundaries106. The RKPM method has been applied to fluids, structures, and acoustics. Lesoinne and Kaila107 used RKPM to compute aeroelastic effects of aircraft with large control surface deflections. Zhang, Wagner, and Liu108 showed that RKPM was well suited for domain decomposition for large-scale parallel computing.

Mesh free Local Petrov-Galerkin
The MLPG method has arisen from the finite element community and is based on the weak form of a given PDE. While the use of the weak form of PDEs relaxes consistency requirements of the field variable approximation, many algorithms in CFD bypass the rigorous use of weak forms. Weak forms require the use of numerical integration since they satisfy global integral forms of the governing equations. Numerical integration, along with other rigorous aspects of weak forms, makes them computationally inefficient compared with simple FDM or FVM approaches. Although Jameson109 showed the equivalency of one FVM scheme with a Galerkin method, most development in CFD has been based on strong forms of the governing equations, which lead to simple and efficient conservative schemes. Nonetheless, an immense mathematical foundation has been developed based on weak forms for a variety of FEM applications. Developed by Atluri and others110-111, the MLPG method is based on a Petrov-Galerkin formulation in which the weight and trial functions used in the weak form of the equations need not be the same. This gives the method a "local" nature in which the integral in the weak form is satisfied over a local domain. The MLPG method thus requires a local "background grid" to perform the integral as demanded by the weak form. However, the integral is performed locally, relieving the need for the global background integration used in related methods. The local background grid may consist of simple shapes, such as circles or squares, so by all practical measures MLPG is essentially meshless. Approximation of the field variables for the MLPG method is constructed using a moving least squares approach. Least squares representations of a function do not pass through the discrete sampling points of the function; instead, they construct a smooth representation which minimizes the error of the approximation. This fact has posed some difficulties in obtaining accurate and stable boundary conditions for the MLPG approach. The MLPG scheme is very general and has been applied to various problems. Specific to fluid mechanics, MLPG has been used to solve the incompressible

105 W. K. Liu, S. Jun, and Y. F. Zhang. "Reproducing kernel particle methods", International Journal for Numerical Methods in Fluids, 20:1081–1106, 1995.
106 F. C. Gunther and W. K. Liu. "Implementation of boundary conditions for meshless methods", Computer Methods Appl. Mech. Engineering, 163:205–230, 1998.
107 M. Lesoinne and V. Kaila. "Meshless aero-elastic simulations of aircraft with large control surface deflections", AIAA paper 2005-1089, AIAA 43rd Aerospace Sciences Meeting and Exhibit, Reno, NV, January 2005.
108 L. T. Zhang, G. J. Wagner, and W. K. Liu. "A parallelized mesh free method with boundary enrichment for large-scale CFD", Journal of Computational Physics, 176:483–506, 2002.
109 A. Jameson, T. J. Baker, and N. P. Weatherill. "Calculation of inviscid transonic flow over a complete aircraft", AIAA paper 1986-0103, AIAA 24th Aerospace Sciences Meeting, Reno, NV, January 1986.
110 S. N. Atluri and T. Zhu. "A new meshless local Petrov-Galerkin (MLPG) approach in computational mechanics", Computational Mechanics, 22:117–127, 1998.
111 S. N. Atluri, H. G. Kim, and J. Y. Cho. "A critical assessment of the truly meshless local Petrov-Galerkin (MLPG) and local boundary integral equation (LBIE) methods", Computational Mechanics, 24:348–372, 1999.


Navier-Stokes equations, using an upwinding scheme to stabilize the convection operator in the streamwise direction.

Mesh free Methods Based on Radial Basis Functions
Radial basis functions (RBFs) are functions which have no preferred direction, but depend only on norms in space; most often, the Euclidean distance is used as the norm. Common RBFs include Gaussians, thin plate splines, and multiquadrics. In general, RBFs are smooth and continuously differentiable. When used for interpolation purposes, RBF approximations are constructed such that they pass through the data points exactly. It is difficult to prove any order of accuracy of such approximations since RBFs are not based on Taylor series or polynomial expansions. While RBFs have been widely used in scattered data interpolation, their application to the solution of PDEs is relatively new. The symmetric and unsymmetric forms have been compared independently, and RBF methods have been compared with the finite element method in terms of accuracy and efficiency, showing improved accuracy of the RBF method over FEM. Sharan has used the popular multiquadric RBFs to solve elliptic PDEs. In a similar work, [Sarler] formulated a solution method for diffusion problems based on RBFs, and in a more general work the theory of Galerkin methods has been integrated with radial basis functions. More recently, [Divo and Kassab] have used RBFs to model convective viscous flows and heat transfer problems, and [Chinchapatnam] has used a localized RBF method to compute incompressible viscous flows. Radial basis methods for compressible flows are much less common; however, Shu has recently proposed such a method based on an upwind approach.

Finite Point Methods
By far, the most prevalent meshless schemes for CFD have been the so-called finite point methods. Finite point methods are usually based on the strong form of the governing PDEs and have given rise to several variants. In general, FPMs are based on least squares fitting of functions to discrete points; these approximate functions form the basis of discretization methods for PDEs. Least squares techniques have been widely used in traditional CFD methods as a means of reconstructing high order solutions, as discussed by [Mavriplis]112; however, the use of least squares as the primary mechanism for PDE discretization in the meshless sense is relatively new. Finite point methods were originally derived as generalizations of FDM for irregular point distributions by [Chung]113. Finite point methods may be categorized into two main classes: methods derived from Taylor series, and methods based on polynomial basis functions. In fact, the Taylor series approach is a specific case of a polynomial method in which the approximated function is constrained to pass through the local cloud center. The Taylor approach is intuitive and has formed the basis for many schemes, including the Least Squares Kinetic Upwind Method (LSKUM). Other approaches based on Taylor series expansions include studies of the order of accuracy of the Taylor method for upwind schemes. The methods based on polynomial basis functions are as numerous as the Taylor based methods. [Batina] was one of the first to use a polynomial basis in conjunction with least squares to compute derivatives for the Euler and Navier-Stokes equations, using an unweighted least squares approach. A similar method was proposed a few years later by [Liu and Su]. Others developed a more rigorous method based on polynomial basis functions; their method incorporated different least squares weighting strategies to improve the accuracy of the derivatives, along with formulations for higher order methods, and they applied it to subsonic compressible inviscid and viscous flows. [Löhner and others] extended the method of Oñate to compressible aerodynamic applications with shocks in three dimensions. They implemented their scheme with the van Leer approximate Riemann solver, gradient reconstruction for high resolution, and limiters to capture shocks.

112 D. J. Mavriplis. "Revisiting the least-squares procedure for gradient reconstruction on unstructured meshes", AIAA paper 2003-3986, AIAA 16th Computational Fluid Dynamics Conference, Orlando, FL, June 2003.
113 K. C. Chung. "A generalized finite-difference method for heat transfer problems of irregular geometries", Numerical Heat Transfer, 4:345–357, 1981.

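Since the finite point methods surveyed above ultimately reduce to fitting a local polynomial to a cloud of points by (weighted) least squares and differentiating the fit, a short sketch may help fix ideas. The snippet below estimates the gradient of a field at a cloud center from a weighted linear least-squares fit over its neighbors; it is a generic illustration of the least-squares derivative idea, not a reproduction of any specific scheme cited above, and the Gaussian weighting and test field are arbitrary assumptions.

```python
import numpy as np

def ls_gradient_2d(center, neighbors, f_center, f_neighbors, weight_h=1.0):
    """Weighted least-squares estimate of grad(f) at 'center' from a local cloud.

    Fits f(x) ~ f_center + g . (x - center) in the least-squares sense, which is
    the basic building block of finite point / meshless gradient reconstruction.
    """
    dx = neighbors - center                      # (n, 2) offsets
    df = f_neighbors - f_center                  # (n,)  value differences
    w = np.exp(-(np.linalg.norm(dx, axis=1) / weight_h) ** 2)  # Gaussian weights
    A = dx * w[:, None]                          # weighted design matrix
    b = df * w
    grad, *_ = np.linalg.lstsq(A, b, rcond=None) # solve min ||A g - b||
    return grad                                  # (df/dx, df/dy)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    center = np.array([0.5, 0.5])
    cloud = center + 0.1 * (rng.random((12, 2)) - 0.5)   # hypothetical local cloud
    f = lambda p: 3.0 * p[..., 0] + 2.0 * p[..., 1]      # linear test field
    print(ls_gradient_2d(center, cloud, f(center), f(cloud)))  # approx. [3.0, 2.0]
```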

Meshless Boundary Schemes
Many of the methods discussed above have been used to enforce boundary conditions for embedded boundary systems. Embedded boundaries arise with the use of non-body-conforming grids, such as Cartesian grids, and meshless methods have been used in place of cut cells and other related methods. One approach uses a polynomial least squares method to compute inviscid slip boundary conditions on embedded Cartesian meshes, with encouraging results for two- and three-dimensional inviscid test cases. Meshless embedded boundary conditions have also been implemented for high Reynolds number viscous flows using the concept of a sub-grid to resolve boundary layers; the sub-grid adds additional resolution near the surface, providing points on which to perform meshless computations. All these methods appear to provide attractive alternatives to Cartesian cut cells or other methods of imposing embedded boundary conditions114.

Solution Procedure for Mesh free Methods115
The procedure of mesh free methods consists of four basic steps:

• Domain representation
• Function approximation
• Formation of system equations
• Solving the global equations

Domain representation
First, the domain and its boundary are modeled (not discretized) using sets of arbitrarily distributed nodes (see Figure 5.1) in the domain and on its boundary. The nodal distribution is usually not uniform; the density of nodes depends on the accuracy requirement of the analysis. Because the nodes carry the values of a field variable (e.g. density, velocity, etc.), they are often called field nodes. Further in the text, a field variable will be referred to as a field function.

Figure 5.1 Domain Representation

Function Approximation
The field function u at any point x = (x, y) within the domain is approximated using the values at the nodes within a "small" local domain of the point x, i.e.

$$u(x) \approx \sum_{i=1}^{n} \phi_i(x)\, u_i \qquad \text{Eq. 5.1}$$

where n is the number of nodes included in the local domain of the point x, u_i is the nodal field function value at the i-th node in the local domain, and ϕ_i(x) is the shape function of the i-th node. The "small" local domain of x will be called the support domain of x and denoted Ω_x. The size of the support domain defines the number of field nodes used to approximate the field at x. Some possible shapes of support domains are shown in Figure 5.2, of which the spherical (circular) support is the most common.

114 For an excellent survey of the literature on meshless methods, see 77.
115 P. Niedoba, L. Čermák, and M. Jícha, "Meshfree methods for computational fluid dynamics", EPJ Web of Conferences 45, 01068 (2013), DOI: 10.1051/epjconf/20134501068.
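Because the support domain Ω_x determines which field nodes enter the sum in Eq. 5.1, a practical implementation needs a fast way to collect the nodes inside that domain. The sketch below uses a k-d tree for a circular (spherical in 3D) support of radius h; it is only an illustrative helper with assumed names and data, not code from the cited references.

```python
import numpy as np
from scipy.spatial import cKDTree

def build_support_domains(nodes, h):
    """Return, for every node, the indices of field nodes inside a circular
    support domain of radius h (the node itself included)."""
    tree = cKDTree(nodes)
    return tree.query_ball_point(nodes, r=h)

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    nodes = rng.random((200, 2))          # scattered field nodes in the unit square
    support = build_support_domains(nodes, h=0.1)
    print("node 0 has", len(support[0]), "nodes in its support domain")
```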


Formation of System Equations
System equations can be formulated using the shape functions and a strong or weak formulation116. These equations are assembled into the global system matrices for the entire problem domain. For static problems, the global system equations are a set of algebraic equations; for general dynamic problems, they form a set of differential equations.

Solving the Global Equations
The last step depends on the type of equations (algebraic, differential, etc.). Note that the global equations for computational fluid dynamics problems are basically nonlinear.

116 See 94.

Figure 5.2 Different Types of Support Domains

Method of Smooth Particle Hydrodynamics (SPH)

The smoothed particle hydrodynamics method belongs to the basic mesh free methods and is used for solving partial differential equations. SPH is basically an interpolation method; the interpolation is based on the theory of integral interpolants using kernels that approximate a delta function. The fluid mass is lumped into smoothed blobs that are moved using Newton's second law directly, without an underlying mesh. In SPH the fluid is modeled as a collection of smooth "blobs" or particles, as depicted in Figure 5.3. A system of ordinary differential equations is produced after approximation of the unknown functions (field functions) and their spatial derivatives; this system is most often solved by explicit numerical methods.

Figure 5.3 1-D SPH Characterization

Formulation
The function approximation of the field function u(x) is based on an integral representation of the function and is given by

$$\langle f(x)\rangle = \int_{\Omega_x} f(\xi)\, W(x-\xi,h)\, d\xi \qquad \text{Eq. 5.2}$$

where W(x − ξ, h) is the weight function (i.e., smoothing or kernel function) and h is the smoothing length, which defines the size of the support domain Ω_x; i.e., the smoothing length determines the number of particles approximating the function at x. Eq. 5.2 is usually referred to as the kernel approximation, or SPH approximation, of the function f(x). For practical calculation, Eq. 5.2 must


be discretized as follows:

$$\langle f(x)\rangle \approx \sum_{j=1}^{n} f(\xi_j)\, V_j\, W(x-\xi_j,h) = \sum_{j=1}^{n} \frac{m_j}{\rho_j}\, f(\xi_j)\, W(x-\xi_j,h) \qquad \text{Eq. 5.3}$$

where m_j and ρ_j are the mass and density of the j-th particle in Ω_x (i.e., V_j = m_j/ρ_j is the volume of the j-th particle). Eq. 5.3 is called the particle approximation of the field function f(x). Note that the approximation (Eq. 5.3) corresponds to the approximation (Eq. 5.1) introduced for a general mesh free method; the shape function in this case has the form

$$\phi_j(x) = \frac{m_j}{\rho_j}\, W(x-\xi_j,h) \qquad \text{Eq. 5.4}$$

The approximation of the spatial derivatives of the field function can be obtained by replacing the function f(x) in Eq. 5.2 with its spatial derivative ∇f(x). Using integration by parts, the Green (divergence) theorem, and a discretization, we obtain the particle approximation of the spatial derivative of the field function in the form

$$\langle \nabla f(x)\rangle \approx \sum_{j=1}^{n} \frac{m_j}{\rho_j}\, f(\xi_j)\, \nabla_x W(x-\xi_j,h) \qquad \text{Eq. 5.5}$$

where ∇_x W(x − ξ_j, h) is the spatial derivative of the weight function with respect to the variable x. We can observe that an approximation of the spatial derivative of a field function is determined using only field function values and derivatives of the weight function. In the same fashion we obtain the Laplacian as

$$\langle \nabla^{2} f(x)\rangle \approx \sum_{j=1}^{n} \frac{m_j}{\rho_j}\, f(\xi_j)\, \nabla_x^{2} W(x-\xi_j,h) \qquad \text{Eq. 5.6}$$
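To connect Eqs. 5.3–5.6 with an implementation, the sketch below evaluates the particle approximation of a field value and of its gradient at a point, using the isotropic Gaussian kernel that appears as Eq. 5.9 below. This is a hedged, minimal illustration on made-up particle data; a production SPH code would restrict the sums to neighbors inside the support domain and typically use a compactly supported kernel.

```python
import numpy as np

def gaussian_w(r, h):
    """Isotropic Gaussian kernel of Eq. 5.9 (3D normalization)."""
    return np.exp(-r**2 / (2.0 * h**2)) / (2.0 * np.pi * h**2) ** 1.5

def gaussian_grad_w(dx, h):
    """Gradient of the Gaussian kernel with respect to x (dx = x - xi_j)."""
    return -dx / h**2 * gaussian_w(np.linalg.norm(dx, axis=-1, keepdims=True), h)

def sph_value_and_gradient(x, xi, f, m, rho, h):
    """Particle approximations of Eqs. 5.3 and 5.5 at the point x.

    xi : (n, 3) neighbor positions, f : (n,) field values,
    m, rho : (n,) neighbor masses and densities, h : smoothing length.
    """
    dx = x - xi                                     # (n, 3) separation vectors
    r = np.linalg.norm(dx, axis=1)                  # (n,)  distances
    w = gaussian_w(r, h)                            # kernel values
    vol = m / rho                                   # particle volumes V_j = m_j / rho_j
    f_x = np.sum(vol * f * w)                       # Eq. 5.3
    grad_f = np.sum((vol * f)[:, None] * gaussian_grad_w(dx, h), axis=0)  # Eq. 5.5
    return f_x, grad_f

if __name__ == "__main__":
    rng = np.random.default_rng(2)
    xi = rng.random((500, 3))                       # hypothetical particle cloud
    f = xi[:, 0]                                    # linear test field f = x
    m = np.full(500, 1.0 / 500)
    rho = np.full(500, 1.0)
    print(sph_value_and_gradient(np.array([0.5, 0.5, 0.5]), xi, f, m, rho, h=0.15))
```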

Smoothing Kernels
The use of different kernels in SPH is analogous to using different difference schemes in finite difference methods; thus the choice of smoothing kernel for a specific problem is significant. The derivatives of the smoothing kernels have an important impact on the different SPH estimates, but we will now focus on the kernels and their required properties. A suitable kernel must have the following two properties:

$$\int_{\Omega_x} W(x-\xi,h)\, d\xi = 1 \qquad \text{and} \qquad \lim_{h\to 0} W(x-\xi,h) = \delta(x-\xi) = \begin{cases} \infty & x = \xi \\ 0 & \text{otherwise} \end{cases} \qquad \text{Eq. 5.7}$$

Eq. 5.7 states that the kernel must be normalized, and that the unit integral ensures that maxima and minima are not enhanced. The kernel must also be positive to ensure that it is an averaging function.


If the kernel is even, then rotational symmetry is enforced, which is useful to ensure invariance under rotations of the coordinate system.

W(x - ξ, h)  0 and W(x - ξ, h)  W(x - ξ, h)

Eq. 5.8

If these conditions are met, the interpolation is of second order accuracy117; that is, the error of the approximation is second order or better. It is also suggested that a suitable kernel should have a limited or compact support radius, in order to ensure zero kernel interactions outside the computational range of the radius. We use the kernel width h as the compact support radius for all smoothing kernels, which implies W(x − ξ, h) = 0 for r > h.

Figure 5.4 The Choice of Different Smoothing Kernels in 1D (h = 1): Gaussian kernel (left), alternative kernels (right)

The first golden rule of SPH states that if a new interpretation of an SPH equation is to be found, it is always best to assume the kernel is a Gaussian118. The isotropic Gaussian kernel (written here with its three-dimensional normalization) is given by

W(x  ξ, h) 

1

2πh 

2 3/2

e

 r2  2  2h 

   

,

h0

Eq. 5.9

which is depicted in Figure 5.4 (left), with r = |x − ξ|. Even though the Gaussian kernel has very nice mathematical properties, it is not always the best kernel to use; e.g., it does not have compact support for our purpose, and it requires the evaluation of the expensive exponential function. There are other choices of kernels, such as W1 = piecewise cubic spline, W2 = quadratic spline, and W3 = exponential function, which are also shown in Figure 5.4 (right), where d = (ξ − ξ_j)/h.

117 J. J. Monaghan. "Smoothed Particle Hydrodynamics". Annual Review of Astronomy and Astrophysics, 30, pp. 543–574, 1992.
118 See above.
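For completeness, the piecewise cubic spline mentioned above (W1 in Figure 5.4) is sketched below in its commonly used Monaghan form; the text does not spell out the exact coefficients of its W1, so the standard coefficients and normalization constants are assumed here. Note that this kernel has support radius 2h, whereas the text uses h as the compact support radius for its kernels.

```python
import numpy as np

def cubic_spline_w(r, h, dim=3):
    """Standard (Monaghan) cubic spline kernel with compact support 2h.

    This is the commonly used form assumed here, with the usual 1D/2D/3D
    normalization constants; it is not claimed to match the document's W1 exactly.
    """
    sigma = {1: 2.0 / (3.0 * h),
             2: 10.0 / (7.0 * np.pi * h**2),
             3: 1.0 / (np.pi * h**3)}[dim]
    q = np.abs(r) / h
    w = np.where(q < 1.0, 1.0 - 1.5 * q**2 + 0.75 * q**3,
                 np.where(q < 2.0, 0.25 * (2.0 - q)**3, 0.0))
    return sigma * w

if __name__ == "__main__":
    r = np.linspace(0.0, 2.5, 6)
    print(cubic_spline_w(r, h=1.0, dim=1))   # vanishes for r >= 2h
```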


Updating of Smoothing Length h
To update h, we can either keep it constant or treat it as a variable.

5.3.3.1 Constant
• h too small: n (the number of neighbors) is too small, and the results are not accurate
• h too big: local information is smoothed out

5.3.3.2 Variable
• known at the beginning: h_i^0
• updated by solving:

$$\frac{Dh_i^{n}}{Dt} = -\frac{h_i^{n}}{N\,\rho_i^{n}}\,\frac{D\rho_i^{n}}{Dt}\,, \qquad \frac{D\rho_i^{n}}{Dt} = \sum_{j=1}^{n} m_j\,\left(\mathbf{u}_i^{n}-\mathbf{u}_j^{n}\right)\cdot\nabla_i W(\xi_i-\xi_j) \qquad \text{Eq. 5.10}$$

Figure 5.5 Ghost Particles; Velocities are Formed Symmetrically (slip wall)

where N is the number of dimensions. This works for slowly varying density; a more complicated procedure is needed for fast expansion or contraction (e.g. in gases)119.

Boundary Treatment
The issue of boundary conditions is generally very difficult in the SPH method. We address the question of properly defining boundary conditions that prevent particles from escaping out of the domain. Furthermore, we discuss consistency near the boundary of the domain (the near boundary area).

Figure 5.6 Virtual Particles

Virtual Particles
The first approach is the use of virtual particles. These particles are situated on the boundary and act through a repulsive force on the particles in the near boundary area (near boundary particles). Hence, virtual particles prevent unphysical penetration through the boundary (see Figure 5.6). Unfortunately, this approach violates the condition for C1 consistency of the SPH approximation in the near boundary area. This is due to the undesirable "cutting off" of the weight function support, see Figure 5.7.

Figure 5.7 Example of a 1D Task; Particle j is Situated in the Near Boundary Area

119 Remo Minero, "Mesh Free Methods for Fluid Dynamics Problems", 17 Dec. 2003.

Thus, the appropriate weight function is not an

even function120.

Ghost Particles
A much better way is to use ghost particles as a boundary condition. In contrast to virtual particles, this approach creates a dynamic wall that is constructed at each time step. Ghost particles are formed symmetrically (with respect to the boundary) to the near boundary particles as "twin" particles, see Figure 5.5. Using ghost particles ensures C1 consistency of the SPH approximation, because the shape functions of the near boundary particles can be even functions. (A minimal code sketch of this mirroring construction is given after the footnotes below.)

Summary and Recap121
• Smoothed particle hydrodynamics is an interpolation method that can approximate continuous field quantities and their derivatives by using discrete sample points, called smoothed particles.
• Particles carry mass, m, position, x, and velocity, u, but can also hold SPH-estimated quantities, e.g. mass density, ρ, pressure, p, etc.
• The relation between volume, mass, and mass density, V = m/ρ, can be used to determine the volume occupied by a particle.
• The following properties must hold for a smoothing kernel: it must be normalized, positive, and even.
• We only use smoothing kernels with a compact support radius h.
The basic formulation of SPH is used to approximate any quantity field and its derivatives. SPH was originally designed for compressible flow problems. Readers are encouraged to consult [Liu & Liu]122 for detailed information and recent trends in SPH methodology.

Case Study 1 - Lid Driven Cavity Problem
To validate the method, the benchmark case of the lid driven cavity is considered, and the results are compared with FDM using the same number of particles, for Re = 10 and 41 x 41 particles (see Figure 5.8).

Figure 5.8 Comparison of FDM with SPH for the Lid Driven Cavity (horizontal velocity at y = 0.8)

120 P. Niedoba, L. Čermák, and M. Jícha, "Meshfree methods for computational fluid dynamics", EPJ Web of Conferences 45, 01068 (2013), DOI: 10.1051/epjconf/20134501068.
121 Micky Kelager, "Lagrangian Fluid Dynamics Using Smoothed Particle Hydrodynamics", January 9, 2006.
122 M.B. Liu, G.R. Liu, "Smoothed Particle Hydrodynamics (SPH): an Overview and Recent Developments", Arch Comput Methods Eng (2010) 17: 25–76, DOI 10.1007/s11831-010-9040-7.
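Picking up the ghost-particle construction described in the Boundary Treatment subsection above (Figure 5.5), the sketch below mirrors near-boundary particles across a planar wall and reflects the wall-normal velocity component, corresponding to a slip wall. The data layout and function name are assumptions for illustration only.

```python
import numpy as np

def make_ghost_particles(pos, vel, wall_point, wall_normal, h):
    """Mirror particles closer than h to a planar wall ('twin' ghost particles).

    pos, vel : (n, d) arrays; wall_normal must be a unit vector pointing into
    the fluid. The normal velocity is reflected (slip wall), tangential kept.
    """
    n = np.asarray(wall_normal, dtype=float)
    dist = (pos - wall_point) @ n                 # signed distance to the wall
    near = (dist >= 0.0) & (dist < h)             # near-boundary particles
    ghost_pos = pos[near] - 2.0 * dist[near, None] * n
    vn = (vel[near] @ n)[:, None] * n             # wall-normal velocity component
    ghost_vel = vel[near] - 2.0 * vn              # reflect the normal component
    return ghost_pos, ghost_vel

if __name__ == "__main__":
    rng = np.random.default_rng(3)
    pos = rng.random((100, 2))                    # fluid occupies y > 0 above wall y = 0
    vel = rng.normal(size=(100, 2))
    gp, gv = make_ghost_particles(pos, vel, wall_point=np.array([0.0, 0.0]),
                                  wall_normal=np.array([0.0, 1.0]), h=0.1)
    print(len(gp), "ghost particles created")
```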


Case Study 2 - Two-Dimensional Convection–Diffusion Problem
A meshless Local Method of Approximated Particular Solutions (LMAPS) is used by [Mužík & Holičková]123 to analyze a problem described by the convection–diffusion equation. The method solves the steady convection–diffusion equation with a reaction term. The discretized system of equations is derived via an interpolation procedure with radial basis functions (RBF). The equation is solved over a simple geometry with a non-uniform velocity field, and the results are presented in the article. The LMAPS method is capable of producing stable solutions with results comparable to the analytical solutions. The local method of approximated particular solutions was proposed by [Chen et al.]124 and has been applied to elliptic and non-linear problems125. In LMAPS the domain is covered by a cloud of scattered nodes. In the work on LMAPS reported so far, the support of any computational node is taken to be a simple subdomain in the shape of a circle, with the computational node at the center of the circle, although in theory the support domain can be of any shape. The most often used interpolation for the field variables was moving least squares, though some researchers used different schemes for interpolation of the field variable and its gradients over the circular boundaries. The area of interest Ω with the boundary ∂Ω is covered by points within the area and also on the global boundary (see Figure 5.9). Consider a local circular (or any simple shape, e.g. rectangular) sub-domain Ω_S centered at every point s. This sub-domain is called the support domain, and using the points in a particular support domain any function can be expressed using just the nodal values126.

Figure 5.9 Diagram of the Global Domain Ω, the Local Support Domain Ωs of Point xs, Global Points x, and Local Points xi
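As a sketch of the local RBF machinery that LMAPS-type methods build on, the snippet below assembles a multiquadric interpolation matrix over a single circular support domain and uses it to obtain derivative weights at the center node. It illustrates only the generic localized-RBF idea; the approximated particular solutions step and the convection–diffusion operator of the cited work are not reproduced, and the shape parameter c and node data are arbitrary assumptions.

```python
import numpy as np

def multiquadric(r, c=0.1):
    return np.sqrt(r**2 + c**2)

def mq_dx(dx, r, c=0.1):
    # d/dx of sqrt(r^2 + c^2), with r = |x - x_j| and dx the x-component of x - x_j
    return dx / np.sqrt(r**2 + c**2)

def local_derivative_weights(nodes, center_index, c=0.1):
    """Weights w such that df/dx(center) ~ w . f(nodes) on one support domain."""
    xc = nodes[center_index]
    diff = nodes[:, None, :] - nodes[None, :, :]          # pairwise offsets
    r = np.linalg.norm(diff, axis=-1)
    A = multiquadric(r, c)                                # interpolation matrix
    rc = np.linalg.norm(xc - nodes, axis=-1)
    b = mq_dx(xc[0] - nodes[:, 0], rc, c)                 # d/dx of each basis at xc
    return np.linalg.solve(A.T, b)                        # w = A^{-T} b

if __name__ == "__main__":
    rng = np.random.default_rng(4)
    nodes = rng.random((15, 2)) * 0.1                     # one small support domain
    w = local_derivative_weights(nodes, center_index=0)
    f = 2.0 * nodes[:, 0] + 1.0                           # linear test field, df/dx = 2
    print(w @ f)                                          # should be close to 2
```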

RKPM Method

The reproducing kernel particle method belongs to the category of finite integral methods and is a modification of the SPH method. This method adds a so-called correction function to the SPH formulation to ensure a certain order of consistency. The particle approximation of the function f(x) is defined as

123 Juraj Mužík and Martina Holičková, "Two-dimensional convection–diffusion problem solved using method of localized particular solutions", MATEC Web of Conferences, January 2017.
124 C.S. Chen, C.M. Fan, P.H. Wen, Numerical Methods for Partial Differential Equations, 28, 506–522 (2012).
125 C.S. Chen, M.A. Golberg, M. Ganesh, A.H.-D. Cheng, Computers and Mathematics with Applications, 359–378 (2002).
126 See 105.


$$\langle f(x)\rangle \approx \sum_{j=1}^{n} \frac{m_j}{\rho_j}\, f(\xi_j)\, C(x,\xi_j)\, W(x-\xi_j,h) \qquad \text{Eq. 5.11}$$

where C(x, ξ_j) is the correction function.

Lagrangian Description of Fluid Dynamics Using SPH

Interactive fluid dynamics is of essential interest in real-time applications, such as computer games or virtual surgery simulators. Using the smoothed particle hydrodynamics (SPH) method, a stable particle-based approach can be used to solve the motion of interactive fluids in a Lagrangian description. With focus on the simulation part, we provide a thorough insight into the mathematical theory of particle-based fluids. The basic Eulerian formulation for an incompressible, isothermal, 2-D flow with constant properties is expressed as

  .u  0 , ρ   u.  u   p  μ. ( u)  f  t 

Eq. 5.12

where μ is the viscosity of the fluid, and f is the sum of the external force densities acting on the fluid, e.g. gravity. Using particles instead of a grid simplifies the equations significantly. We assume that the number of particles is constant during the simulation; by keeping the mass fixed for each particle, mass conservation is guaranteed, and the conservation of mass equation can be omitted. Figure 5.10 depicts a basic layout of a particle-based fluid, reduced to two dimensions for clarity. The particles are represented by the dots, and the circles represent the volume of each particle.

Figure 5.10 Lagrangian Particle-Based Fluid Structure in 2D

In the Lagrangian formulation of a fluid the particles completely define the fluid, which implies that the particles move with the fluid. Compared to the Eulerian view, this means that any field quantity now depends on time, t, only. The particles carry mass, position, and velocity, and will hold smoothed quantity approximations obtained from SPH. The acceleration of a Lagrangian fluid particle becomes the ordinary time derivative of its velocity; this is why the total derivative term (D/Dt) is reduced to a simple d/dt in the Lagrangian view. The basic Lagrangian formulation of the Navier-Stokes equations for an incompressible, isothermal fluid is given by

$$\rho \frac{d\mathbf{u}}{dt} = \underbrace{-\nabla p + \mu\,\nabla^{2}\mathbf{u}}_{\mathbf{f}^{\,\mathrm{internal}}} + \underbrace{\mathbf{f}}_{\mathbf{f}^{\,\mathrm{external}}}\,, \qquad \mathbf{F} = \mathbf{f}^{\,\mathrm{internal}} + \mathbf{f}^{\,\mathrm{external}}\,, \qquad \mathbf{a}_i = \frac{d\mathbf{u}_i}{dt} = \frac{\mathbf{F}_i}{\rho_i} \qquad \text{Eq. 5.13}$$

where a_i is the particle acceleration, f_internal denotes the pressure and viscous force densities, and f_external is assigned to gravity.

Default Kernel


We learned about the first golden rule of SPH, and we also concluded that the isotropic Gaussian kernel is not fit for our purpose. We need a default smoothing kernel with compact support for the inter-particle SPH computations required to solve Eq. 5.13. Several suggestions for SPH kernels are discussed in127, among them the B-Spline and Q-Spline kernels, where the Q-Spline is concluded to be the best kernel in terms of computational accuracy. However, the Q-Spline kernel requires the evaluation of a square root, which can be expensive if the kernel is used often. Instead we will use the 6th degree polynomial kernel suggested by128 as the default kernel, which is given by

$$W_{\mathrm{default}}(\mathbf{x},h) = \frac{315}{64\pi h^{9}}\left(h^{2}-\|\mathbf{x}\|^{2}\right)^{3}, \qquad 0 \le \|\mathbf{x}\| \le h \quad (0 \text{ otherwise})$$
$$\nabla W_{\mathrm{default}}(\mathbf{x},h) = -\frac{945}{32\pi h^{9}}\,\mathbf{x}\left(h^{2}-\|\mathbf{x}\|^{2}\right)^{2}, \qquad
\nabla^{2} W_{\mathrm{default}}(\mathbf{x},h) = -\frac{945}{32\pi h^{9}}\left(h^{2}-\|\mathbf{x}\|^{2}\right)\left(3h^{2}-7\|\mathbf{x}\|^{2}\right) \qquad \text{Eq. 5.14}$$

The default kernel and its derivatives in one dimension can be depicted as:

Figure 5.11 The Default Kernel and its Derivatives in One Dimension for h = 1

The default kernel and its derivatives are used for all smoothed quantity field approximations, except for the internal fluid force fields. For further information regarding the various smoothing kernels and their applications, please consult129.

127 J. Hongbin and D. Xin. "On criterions for smoothed particle hydrodynamics kernels in stable field". Journal of Computational Physics, 202, pp. 699–709, 2005.
128 M. Müller, D. Charypar, and M. Gross. "Particle-Based Fluid Simulation for Interactive Applications". Proceedings of the 2003 ACM SIGGRAPH Symposium on Computer Animation, pp. 154–159, 2003.
129 Micky Kelager, "Lagrangian Fluid Dynamics Using Smoothed Particle Hydrodynamics", January 9, 2006.
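A direct transcription of Eq. 5.14 — the poly6 default kernel together with its gradient and Laplacian — is sketched below. The vectorized NumPy form and the clamping of radii outside the support are implementation choices, not prescribed by the text.

```python
import numpy as np

def w_poly6(r, h):
    """Default (poly6) kernel of Eq. 5.14, zero outside 0 <= r <= h."""
    k = 315.0 / (64.0 * np.pi * h**9)
    q = np.clip(h**2 - r**2, 0.0, None)          # (h^2 - r^2), clamped outside support
    return k * q**3

def grad_w_poly6(dx, h):
    """Gradient of the poly6 kernel; dx = x - xi_j (vector), shape (..., d)."""
    r2 = np.sum(dx**2, axis=-1, keepdims=True)
    k = -945.0 / (32.0 * np.pi * h**9)
    q = np.clip(h**2 - r2, 0.0, None)
    return k * dx * q**2

def lap_w_poly6(r, h):
    """Laplacian of the poly6 kernel."""
    k = -945.0 / (32.0 * np.pi * h**9)
    return np.where(r <= h, k * (h**2 - r**2) * (3.0 * h**2 - 7.0 * r**2), 0.0)

if __name__ == "__main__":
    h = 0.1
    r = np.linspace(0.0, h, 5)
    print(w_poly6(r, h))
    print(lap_w_poly6(r, h))
    print(grad_w_poly6(np.array([[0.05, 0.0, 0.0]]), h))
```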


Numerical Time Integration
To simulate the fluid flow, each particle is advanced through time using a global fixed time step Δt. Eq. 5.13 is employed to compute the particle acceleration, and the new particle position is obtained by integrating the acceleration numerically. In this section three different integration schemes are introduced.

5.5.2.1 The Implicit Euler Scheme
The Implicit Euler scheme used here is actually a semi-implicit method, as only the position update is implicit. Semi-implicit Euler is based on the explicit Euler scheme, which is probably the most common integration method. In explicit Euler the position and velocity are updated in parallel; in the semi-implicit variant the position update is no longer independent of the velocity update, as

x t Δt  x t  Δt u t Δt Eq. 5.15

5.5.2.2 The Verlet Scheme
In the Verlet scheme the velocity is handled implicitly, and the new position is predicted from the current and previous positions together with the current acceleration,

x t Δt  2x t  xt-Δt  Δt 2u t

Eq. 5.16

The Verlet scheme is one of the computationally fastest integrators and it is usually very stable, as the velocity is given implicitly and will not get out of sync with the position. However, collision responses are not trivial to handle, as they involve modifying positions rather than velocities.

Figure 5.12 The Leap-Frog Mechanism

5.5.2.3 The Leap-Frog Scheme
The leap-frog integration gets its name from the fact that the velocities leap over the positions, and vice versa, as illustrated in Figure 5.12, where the horizontal line represents time t and the

subscripts on the positions and velocities u indicate the specific time. The position update has the implicit Euler structure, and the scheme yields

$$\mathbf{u}_{t} \approx \frac{\mathbf{u}_{t-\Delta t/2} + \mathbf{u}_{t+\Delta t/2}}{2}\,, \quad \text{where} \quad \mathbf{u}_{t+\Delta t/2} = \mathbf{u}_{t-\Delta t/2} + \Delta t\, \mathbf{a}_{t} \quad \text{and} \quad \mathbf{u}_{\Delta t/2} = \mathbf{u}_{0} + \tfrac{1}{2}\Delta t\, \mathbf{a}_{0} \qquad \text{Eq. 5.17}$$
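The update rules of Eqs. 5.15–5.17 condense into a few lines of code. The sketch below shows the semi-implicit Euler and leap-frog updates for an array of particles, with a placeholder acceleration routine (gravity only) standing in for the SPH force evaluation of Eq. 5.13; every name here is an assumption for illustration.

```python
import numpy as np

GRAVITY = np.array([0.0, -9.81])

def accelerations(pos, vel):
    """Placeholder for the SPH force evaluation (Eq. 5.13); gravity only here."""
    return np.broadcast_to(GRAVITY, pos.shape)

def semi_implicit_euler_step(pos, vel, dt):
    """Eq. 5.15: update the velocity first, then use the new velocity for the position."""
    vel = vel + dt * accelerations(pos, vel)
    pos = pos + dt * vel
    return pos, vel

def leapfrog_step(pos, vel_half, dt, first_step=False):
    """Eq. 5.17: velocities live at half steps and 'leap over' the positions."""
    a = accelerations(pos, vel_half)
    if first_step:
        vel_half = vel_half + 0.5 * dt * a        # u_{dt/2} = u_0 + dt/2 * a_0
    else:
        vel_half = vel_half + dt * a              # u_{t+dt/2} = u_{t-dt/2} + dt * a_t
    pos = pos + dt * vel_half                     # implicit-Euler-style position update
    return pos, vel_half

if __name__ == "__main__":
    pos = np.zeros((4, 2))
    vel = np.zeros((4, 2))
    for step in range(10):
        pos, vel = leapfrog_step(pos, vel, dt=0.01, first_step=(step == 0))
    print(pos[0])        # particles fall under gravity
```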


In theory, a time integration scheme will follow Newton’s 1st law, but numerical dissipation can reluctantly damp the linear motion of the particles. Typically, this is not a problem in physics-based animation, because the damping can be explained as a small scale air resistance or friction. Especially the Verlet scheme is easily influenced by numerical damping. We have chosen not to introduce any explicit damping in the time integrators, due to the different ways integrators handle damping. We rely on the viscosity force to provide the necessary numerical damping130. Collision Handling The small-scale working domain of interactive Lagrangian fluids is limited. A practical way of meeting a convincing environment of the fluid is to constraint the particle system within well-defined boundaries. Boundary containers, such as boxes, spheres, and capsules, are commonly used to constraint a fluid. When particles collide with a container they must stay inside its boundaries. Likewise, if particles collide with an obstacle, they may not penetrate or gain access to the interior of the object. Collision handling can be divided into two sub parts; collision detection and collision response. Further discussion is avoided here and interested readers are encourage to read131. Case Study 1 – Comparison of Weakly Compressible and Incompressible SPH The comparative study for the Weakly Compressible (WCSPH) and Incompressible (ISPH) Smoothed Particle Hydrodynamics methods over an airfoil is investigated by [Shadloo, et. al,]132. WCSPH and ISPH simulation results are compared and validated with those of a finite element method (FEM). The quantitative comparisons of WCSPH, ISPH and FEM results in terms of Strouhal number, and velocity gradients on the airfoil boundaries as well as the lift and drag values for the airfoil geometry indicate that the WCSPH method with the suggested implementation produces numerical results as accurate and reliable as those of the ISPH and FEM methods. 5.5.4.1 Formulation of Problem The SPH method relies on the idea of smoothing field properties over a bounded domain through the devised as in Eq. 5.18 which is referred to as the kernel approximation to an arbitrary function f (ri). In fact, this arbitrary function can be any hydrodynamic transport property such as temperature, enthalpy, density, viscosity and so forth. Here, W(rijj, h) is a kernel function, the angle bracket hi denotes the kernel approximation < >, is the position vector defining the center point of the kernel function, rij is the magnitude of the distance vector between the particle of interest i and its neighboring particles j, d3(rj) is a differential volume element within the total bounded volume of the domain _, and the length h defines the support domain of the particle of interest. The SPH technique in Equation (8.18) assumes that the fields of a given particle are affected only by that of other particles within a cutoff distance of the particle of interest with a smoothing radius kh where k is a coefficient associated with the particular kernel function. A smoothing kernel function is a piecewise spline that should satisfy several conditions: the normalization, the Dirac-delta function, compactness, spherical symmetry, and positive and even function properties. A thorough discussion on the details of these attributes of the kernel function can be found in133 and the references therein. 
In the SPH literature, it is possible to find different forms of piecewise smoothing kernel functions possessing the above-listed properties, such as Gaussian, cubic, or quintic kernel functions. Throughout the present simulations, the compactly supported two-dimensional quintic spline kernel, given in Eq. 5.18, is used.

130 Micky Kelager, "Lagrangian Fluid Dynamics Using Smoothed Particle Hydrodynamics", January 9, 2006.
131 See above.
132 Mostafa Safdari Shadloo, Amir Zainali, Mehmet Yildiz, and Afzal Suleman, "A robust weakly compressible SPH method and its comparison with an incompressible SPH", Int. J. Numer. Meth. Engng, (2011).
133 Liu MB, Liu GR, "Smoothed Particle Hydrodynamics (SPH): an overview and recent developments", Archives of Computational Methods in Engineering, 2010.


$$f(\mathbf{r}_i) \approx \langle f(\mathbf{r}_i)\rangle = \int_{\Omega} f(\mathbf{r}_j)\, W(r_{ij}, h)\, d^3 r_j$$

where

$$W(r_{ij}, h) = \frac{7}{478\pi h^2}
\begin{cases}
(3-s_{ij})^5 - 6(2-s_{ij})^5 + 15(1-s_{ij})^5 & \text{if } 0 \le s_{ij} < 1\\
(3-s_{ij})^5 - 6(2-s_{ij})^5 & \text{if } 1 \le s_{ij} < 2\\
(3-s_{ij})^5 & \text{if } 2 \le s_{ij} < 3\\
0 & \text{if } s_{ij} \ge 3
\end{cases}
\qquad \text{and} \quad s_{ij} = r_{ij}/h
\qquad \textbf{Eq. 5.18}$$
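A direct transcription of the two-dimensional quintic spline kernel of Eq. 5.18 is sketched below; the function and variable names are illustrative, and only the kernel evaluation itself is shown (no neighbor search).

```python
import numpy as np

def quintic_spline_W(r_ij, h):
    """2D quintic spline kernel W(r_ij, h) of Eq. 5.18, with support radius 3h."""
    s = r_ij / h
    alpha = 7.0 / (478.0 * np.pi * h**2)   # two-dimensional normalization constant
    if s < 1.0:
        w = (3 - s)**5 - 6 * (2 - s)**5 + 15 * (1 - s)**5
    elif s < 2.0:
        w = (3 - s)**5 - 6 * (2 - s)**5
    elif s < 3.0:
        w = (3 - s)**5
    else:
        w = 0.0
    return alpha * w
```

Because the kernel vanishes at s_ij = 3, only neighbors within the cutoff distance 3h contribute to any particle sum.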

5.5.4.2 Results
Figure 5.13 compares the velocity contours of ISPH (upper), FEM (middle), and WCSPH (lower) for angles of attack of 5 and 15 degrees (contours show the velocity magnitude, m/s) at Re = 570. Similar to the previous benchmark problem, both WCSPH and ISPH results are in good agreement with those of the mesh-dependent FEM technique. In all simulations, the results of WCSPH are as accurate as the ISPH ones. The figures further illustrate that the proposed algorithm is also very successful in simulating the flow around the airfoil geometry at different angles of attack across the flow field134.

Figure 5.13 Comparison of ISPH (upper), FEM (middle), and WCSPH (lower) velocity contours for the angle of attack of 15 degrees at Re = 570 (Courtesy of Shadloo)

Case Study 2 - Dam Break Water Flow using Lagrangian Description
The analysis of fluid flow is more an area of interest for physicists than computer scientists. However, in order to be convinced that the Lagrangian fluid method can produce realistic fluid motion, we will examine the fluid flow. We will study the velocity fields produced by the dam-break problem for water. In the classic dam-break problem the fluid is constrained inside a dam, and when the fluid is at rest the dam is broken, or the barricade that constrains the fluid is removed. The fluid then flows freely and often collides with a vertical wall. Frames from the dam break of water simulated by particles are depicted in Figure 5.14, where the

134 Mostafa Safdari Shadloo, Amir Zainali, Mehmet Yildiz, and Afzal Suleman, "A robust weakly compressible SPH method and its comparison with an incompressible SPH", Int. J. Numer. Meth. Engng, (2011).


time interval between frames is 0.1 s, reading from left to right, top to bottom. This is just a survey of how the visible water particles flow in the dam-break problem; the frames show the dam break of water simulated by 2250 particles.

Figure 5.14 Dam-Break Flow of Water

Case Study 3 - Dam Break using MLPG-RBF and Shallow Water Equations
The application of the meshless local Petrov-Galerkin (MLPG) method to solve the shallow water equations (SWE) was investigated by [Mužík and Holičková]135. The shallow water equations (also called the de Saint-Venant equations) are used to describe flow in bodies of water where the horizontal length scales are much greater than the flow depth; the 3D problem can therefore be treated as 2D. This localized approach is based on the meshless weak formulation with the use of radial basis functions (RBF) as the trial functions. In this work, the numerical model is applied

Figure 5.15 Geometry and water surface profile of the 2D dam-break problem at t = 7.2 s

135 Juraj Mužík, Martina Holičková, "Meshless simulation of dam break using MLPG-RBF and shallow water equations", MATEC Web of Conferences 117, 00127 (2017).


to simulate a dam-break problem, one of the most descriptive benchmark problems for the SWE. As a result, the adopted meshless method not only shows its applicability to the class of problems described by the SWE, but also brings more efficiency than several conventional mesh-based methods. The problem models a partial dam break with the rapid opening of a sluice gate through a non-symmetric breach, and tests the ability to simulate discontinuous flows. The computational domain is a 200 m by 200 m region. A dam is located in the middle of the domain with 10 m thickness. The initial water depth is 10 m on one side and 5 m on the other side of the dividing wall. At time t = 0, the dam fails and the water is released through the 75 m wide non-symmetric breach, as shown in Figure 5.15 (left). When the downstream water depth is 5 m, the flow is subcritical everywhere. The boundary conditions at x = 0 and x = 200 m are assumed to be transmissive, and all other boundaries are considered reflective. At the instant of the dam break, water is released through the breach, forming a positive wave propagating downstream and a negative wave spreading upstream. The results at t = 7.2 s (Figure 5.15, right), when the waves have not yet reached all the boundaries, are compared with a least-squares finite-element method (LSFEM). The left-moving positive wave and the right-moving negative wave are both well resolved. The scheme was found to be stable in capturing the fine details of the flow, and its behavior is in satisfactory agreement with the computed results of these researchers.

Case Study 4 - SPH Method for Evaporating Multiphase Flows
Because evaporation is encountered in many engineering applications, such as fuel droplets in engines, liquid sprays, and material processing, a numerical method to accurately predict liquid evaporation is of great importance. Common engineering models for predicting droplet evaporation assume that the liquid droplet is a point source with homogeneous properties. The primary concern of these models is the mass transfer rate, without consideration of the gradients in the droplet or the liquid-gas interface. While such models are useful in engineering applications, advanced numerical methods are needed to reveal the details of the evaporation process. The dynamics of evaporating flows involves phase change and energy transfer at the liquid-gas interface, diffusion of vapor species in the gas phase, and multiphase flows with sharp interfaces. Because of the complexity of the evaporation problem, a detailed numerical simulation is challenging. The main numerical challenges in simulating evaporating flows include the treatment of phase change and the sharp discontinuity of fluid properties at the liquid-gas interface. Phase change due to evaporation causes mass transfer from one phase to the other. The discontinuity at the liquid-gas interface, in variables such as density, also leads to numerical difficulties. The intent of the work by [Xiufeng Yang & Song-Charng Kong]136 is to provide a numerical method, based on smoothed particle hydrodynamics (SPH), to simulate multiphase flows with evaporation. The SPH method is a Lagrangian mesh-free particle method. In SPH, a continuous fluid is discretized using SPH particles, which carry physical properties such as mass, density, pressure, viscosity, and velocity. Since SPH is a mesh-free method, a smoothing kernel is introduced to connect the neighboring particles.
The variables and their spatial derivatives are discretized as summations over particles. In the SPH method developed for this study, the SPH particles near the interface are allowed to change their mass to model the process of evaporation at the interface. The rate of mass change of SPH particles due to evaporation depends on the vapor mass fraction in the gas phase and the saturated vapor mass fraction at the interface. The saturated vapor mass fraction can be predicted by the Clausius-Clapeyron correlation. During the process of evaporation, the mass of a liquid SPH particle at the interface decreases, while the mass of a gas SPH particle increases. To constrain the mass of individual SPH particles, a particle will split into smaller particles if its mass grows too large, or merge into a neighboring particle if its mass becomes too small.
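As a rough illustration of the interface condition described above, the sketch below evaluates a Clausius-Clapeyron type saturated vapor mass fraction. The reference point, latent heat, and molar masses are generic water/air textbook values, and the conversion from partial pressure to mass fraction assumes an ideal gas mixture; none of these constants are taken from the cited paper.

```python
import numpy as np

def saturation_pressure(T, p_ref=101325.0, T_ref=373.15, h_fg=2.26e6, M=0.018, R=8.314):
    """Clausius-Clapeyron estimate of the saturation pressure [Pa] at temperature T [K].

    p_ref, T_ref : a known point on the saturation curve (here 1 atm at 100 C for water)
    h_fg         : latent heat of vaporization [J/kg], M : molar mass of the vapor [kg/mol]
    """
    return p_ref * np.exp(-(h_fg * M / R) * (1.0 / T - 1.0 / T_ref))

def saturated_vapor_mass_fraction(T, p_total=101325.0, M_vap=0.018, M_gas=0.029):
    """Vapor mass fraction at the interface, assuming an ideal binary gas mixture."""
    x = saturation_pressure(T) / p_total                 # mole fraction of vapor
    return x * M_vap / (x * M_vap + (1.0 - x) * M_gas)   # convert to mass fraction

# Example: interface at the drop temperature used later in the static-drop case (353 K)
Y_sat = saturated_vapor_mass_fraction(353.0)
```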

136 Xiufeng Yang and Song-Charng Kong, "Smoothed Particle Hydrodynamics Method for Evaporating Multiphase Flows", Physical Review E 96, 033309 (2017).


5.5.7.1 Basic Formulations of the SPH Method
In SPH, the value of a function f(r) at point ra can be approximated using the following integration:

$$f(\mathbf{r}_a) \approx \int f(\mathbf{r})\, W(\mathbf{r}_a - \mathbf{r}, h)\, dV \qquad \textbf{Eq. 5.19}$$

where W is a kernel function and dV is a differential volume element. The parameter h is referred to as a smoothing length, which determines the size of the integral domain. In this paper, the following hyperbolic-shaped kernel function in two-dimensional space is used:

$$W(s, h) = \frac{1}{3\pi h^2}
\begin{cases}
s^3 - 6s + 6 & 0 \le s < 1\\
(2-s)^3 & 1 \le s < 2\\
0 & s \ge 2
\end{cases}
\qquad \textbf{Eq. 5.20}$$

with s = |ra − r|/h.

In the SPH method, a continuous fluid is discretized into particles carrying properties such as mass m, density ρ, velocity u, and viscosity μ. The integration of Eq. 5.19 is then discretized as a particle summation as follows:

$$f(\mathbf{r}_a) \approx \sum_b \frac{m_b}{\rho_b}\, f(\mathbf{r}_b)\, W(\mathbf{r}_a - \mathbf{r}_b, h) \qquad \textbf{Eq. 5.21}$$
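A minimal sketch of the particle summation of Eq. 5.21, paired with the hyperbolic-shaped kernel of Eq. 5.20, is given below; the particle arrays are assumed inputs, and no neighbor search or symmetrization is shown.

```python
import numpy as np

def W_hyperbolic_2d(r, h):
    """Hyperbolic-shaped 2D kernel of Eq. 5.20 (support radius 2h)."""
    s = r / h
    w = np.where(s < 1.0, s**3 - 6.0 * s + 6.0,
        np.where(s < 2.0, (2.0 - s)**3, 0.0))
    return w / (3.0 * np.pi * h**2)

def sph_interpolate(r_a, positions, values, masses, densities, h):
    """Evaluate f(r_a) ~ sum_b (m_b / rho_b) f(r_b) W(|r_a - r_b|, h), as in Eq. 5.21."""
    r = np.linalg.norm(positions - r_a, axis=1)   # distances |r_a - r_b| to all particles b
    return np.sum(masses / densities * values * W_hyperbolic_2d(r, h))
```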

5.5.7.2 Evaporation of a Static Drop
The evaporation of a static drop was simulated using the proposed SPH method. The initial radius of the drop is R0 = 0.15 mm. The initial temperature of the drop is 353 K. The drop was located at the center of a square computational domain, which was filled with gas. The side length of the square was 1.2 mm. The initial temperature of the gas was 373 K. The temperature of the boundary was also 373 K and did not change during the simulation. These temperatures were chosen in order to be consistent with, and to allow comparisons with, the conditions in the literature. The initial vapor mass fraction in the gas phase was zero. The vapor mass fraction of the boundary remained zero. The initial particle spacing was 0.02 mm. Figure 5.16 shows that the size of the drop decreased slightly. The decrease in the drop size was compared with the result from a two-dimensional (2D) axisymmetric level-set method. It should be noted that the 2D circle used in this study corresponded to the cross section of a three-dimensional (3D) cylinder of infinite length, while the 2D axisymmetric circle corresponded to a 3D sphere.

Figure 5.16 Snapshots of the Evaporating Drop at different times using SPH

5.5.7.3 Evaporation of a Dynamic Drop Impacting on a Hot Surface
The proposed method was also used to simulate the evaporation of a drop impacting a hot surface. The initial radius of the drop was R = 0.25 mm and the initial velocity of the drop was U = 2 m/s. The height and length of the computational domain were 1.5 and 5.0 mm, respectively. The drop was


located at the center of the domain and was surrounded by gas. The initial temperature of the drop was 353 K. The initial temperature of the gas was 373 K. The temperature of the boundaries was also 373 K and did not change during the simulation. The initial vapor mass fraction in the gas phase was zero. The vapor mass fraction of the boundary remained zero. The initial particle spacing was 0.02 mm. Figure 5.17 shows the evolution of the drop impact on a hot surface. After the drop touched the surface, it spread and formed a film on the surface. At approximately 1.0 ms, a tiny crown-like structure was formed around the rim. Later, the crown merged with the film, and the film receded. Finally, the film reached an equilibrium size. Since the initial temperature of the drop was lower than the gas temperature, the heat transfer from the surrounding gas to the drop led to a decrease in the local gas temperature. However, the drop temperature also decreased slightly because evaporation consumed energy, as discussed earlier. When the drop spread on the hot surface and formed a film, heat transfer from the hot surface to the film increased the temperature of the film. The intent of this paper was to present an SPH method to simulate evaporating multiphase flows. This method accurately models the process of evaporation at the liquid-gas interface and the diffusion of the vapor species in the gas phase. An evaporating mass rate was derived to calculate the mass transfer at the interface. To model the process of phase change from the liquid phase to the gas phase, mass was allowed to transfer from a liquid SPH particle to a gas SPH particle. Thus, the proposed method, unlike the traditional SPH method, allows the mass of an SPH particle to change. Additionally, particle splitting and merging techniques were developed to avoid large differences in SPH particle mass.

Figure 5.17 Evolution of Dynamic Drop Impact on a Hot Surface using SPH

5.5.7.4 Concluding Remarks
In general, the results show that the method proposed in this paper successfully replicated the physical processes of evaporating flows, such as heat and mass transfer and the diffusion of the vapor species. One example was the simulation of the evaporation of a static drop: because of evaporation, the present SPH method predicts decreases in both the temperature of the interface and the size of the drop. The last example was the simulation of the evaporation of a drop impacting a hot surface. The temperature of the liquid-gas interface decreased at first because of evaporation, especially at the rim of the film. Then the temperature increased because of the heat transfer from the hot surface to the liquid. In summary, the results of this study indicate that the numerical method proposed in this paper can be successfully used to produce an evaporating flow simulation. Additional information


can be obtained from [Yang & Kong]137.

137 Xiufeng Yang and Song-Charng Kong, "Smoothed Particle Hydrodynamics Method for Evaporating Multiphase Flows", Physical Review E 96, 033309 (2017).


6 HVAC in Building and Related Issues
In the early days of construction, most building-related issues such as ventilation analysis, wind loading, and wind environment were addressed by wind tunnel tests, but today all of these tests can be carried out effectively with CFD techniques. CFD can resolve all of the above-mentioned issues in a very short time, and it is both more economical and more powerful than the older (experimental) approach [3]. Recently, computational fluid dynamics has been used as a sophisticated airflow modelling method to predict airflow, heat transfer, and contaminant transport in and around buildings. CFD plays an important role in building design: designing a thermally comfortable, healthy, and energy-efficient building. CFD can examine the effectiveness and efficiency of various heating, ventilation and air conditioning (HVAC) systems by easily changing the type and location of components such as diffusers, as well as supply air conditions and system control schedules. Furthermore, CFD helps in developing passive heating/cooling/ventilation strategies (e.g. natural ventilation) by modelling and optimizing building site plans and indoor layouts. Globally, the building sector accounts for approximately 40% of total energy consumption138. In the present era, there is a large gap between energy consumption and energy production. As the building sector accounts for a large share of total consumption, it becomes essential to investigate the optimum building configuration to reduce this share, and CFD can play an important role here. Energy simulation and CFD programs are important building design tools used for the evaluation of building performance, including thermal comfort, indoor air quality, mechanical system efficiency, and energy consumption139. CFD in buildings is mainly used for one or more of the following purposes:

• Thermal analysis through walls, roof, and floor of buildings
• Ventilation analysis
• Orientation, site, and location selection of buildings based on local geographical and environmental conditions

Thermal Analysis in Buildings
In buildings, heat transfer takes place in all its modes, i.e., conduction, convection, and radiation. In order to reduce heat losses from buildings, CFD analysis can be carried out to find the optimum configuration of composite walls, roof, and floor. The differential form of the general transport equation is as follows:

$$\underbrace{\frac{\partial(\rho\varphi)}{\partial t}}_{\text{transient}} \;+\; \underbrace{\nabla\cdot(\rho\mathbf{u}\varphi)}_{\text{convection}} \;=\; \underbrace{\nabla\cdot(\Gamma\nabla\varphi)}_{\text{diffusion}} \;+\; \underbrace{S_\varphi}_{\text{source}} \qquad \textbf{Eq. 6.1}$$
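As an illustration of how Eq. 6.1 reduces to a solvable system for the one-dimensional, steady heat conduction case discussed below, the following finite-volume sketch assembles and solves the temperature field through a wall. The wall thickness, conductivity, source term, and surface temperatures are illustrative assumptions, not values from the text.

```python
import numpy as np

def steady_1d_conduction(L=0.3, k=0.8, S=0.0, T_left=293.0, T_right=268.0, n=50):
    """Steady 1D conduction through a wall: d/dx(k dT/dx) + S = 0 (steady, 1D form of Eq. 6.1).

    Uniform finite-volume grid with fixed (Dirichlet) surface temperatures.
    Returns cell-center coordinates and temperatures.
    """
    dx = L / n
    A = np.zeros((n, n))
    b = np.full(n, -S * dx)                         # volumetric source integrated over each cell
    for i in range(n):
        aW = k / dx if i > 0 else 2.0 * k / dx      # boundary cells see a half-cell to the face
        aE = k / dx if i < n - 1 else 2.0 * k / dx
        A[i, i] = -(aW + aE)
        if i > 0:
            A[i, i - 1] = aW
        else:
            b[i] -= aW * T_left                     # west boundary temperature contribution
        if i < n - 1:
            A[i, i + 1] = aE
        else:
            b[i] -= aE * T_right                    # east boundary temperature contribution
    T = np.linalg.solve(A, b)
    x = (np.arange(n) + 0.5) * dx
    return x, T
```

With S = 0 the solution is the expected linear temperature profile between the two surface temperatures; a nonzero S adds the parabolic contribution of internal heat generation.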

The numerical solution of the above equation can be obtained by the finite difference method (FDM), the finite volume method (FVM), or the finite element method (FEM). For heat transfer analysis in buildings, the scalar function φ in Eq. 6.1 is replaced by the temperature T, the diffusion coefficient Γ is replaced by the thermal conductivity k, and the source term Sφ is replaced by a heat generation term, a radiative heat source, or both (depending on the nature of the source available), giving different forms of the equation for different cases. For simplicity and ease of understanding, only one-dimensional

138 Wikipedia.
139 Zhai, Zhiqiang John; Chen, Qingyan Yan (2005), "Performance of coupled building energy and CFD simulations", Energy and Buildings, 37 (4): 333.


cases have been discussed. In buildings, the heat transfer analysis can be carried out for all parts of the building (walls, roof, and floor) in the following two ways:
1. Steady-State Thermal Analysis
2. Transient Thermal Analysis

Ventilation Analysis
The ventilation study in buildings is done to find a thermally comfortable environment with acceptable indoor air quality by regulating the indoor air parameters (air temperature, relative humidity, air speed, and chemical species concentrations in the air). CFD plays an important role in regulating the indoor air parameters to predict the ventilation performance of buildings. The ventilation performance prediction provides information on the indoor air parameters in a room or a building even before construction. These air parameters are crucial for designing a comfortable indoor as well as outdoor environment. This is because the design of appropriate ventilation systems and the development of control strategies need detailed information regarding the following parameters:

• Airflow
• Contaminant dispersion
• Temperature distribution

The aforesaid information is also useful for an architect to design the building configuration. Over the last three decades, the CFD technique has been widely used with considerable success in buildings. Recently, ventilation and its related fields have become a large part of wind engineering. A ventilation study can be done using wind tunnel investigation (experimentally) or by CFD modeling (numerically). A natural ventilation system is always preferred over a forced ventilation system, as it saves fuel and is economical as well as environmentally friendly. In the present era, due to the development of many CFD and building energy simulation software packages, it has become quite easy to assess the feasibility of a natural or forced ventilation system in a building. CFD analysis is more useful than the experimental approach because other relations among the variables can be found in post-processing. The data obtained either experimentally or numerically are useful in two ways: (1) better comfort of the user, and (2) as input to the heat balance calculation of the building.

6.1.1.1 Numerical Simulations of the Effect of Outdoor Pollutants on Indoor Air Quality of Buildings next to a Street Canyon
To explore the effect of traffic pollution on the indoor air quality of naturally ventilated buildings in the vicinity of a street canyon, the wind flow and pollutant distributions in and around buildings with different Window Opening Percentages (WOP, the percentage of the total window opening area to the total facade area) were investigated by three-dimensional numerical simulations [Yang et al.]140. The numerical results show that the WOP changes the pressure distribution around the downstream building, which is due to the infiltration of air into the street canyon through the open windows of both the upstream and downstream buildings. When the indoor air of the downstream building is supplied by the outdoor air from the street canyon, the ventilation flux increases with increasing WOP. If the indoor air is taken in from the protected side of the downstream building,

140 Fang Yang, Yanming Kang, Yongwei Gao, Ke Zhong, "Numerical simulations of the effect of outdoor pollutants on indoor air quality of buildings next to a street canyon", Building and Environment, Volume 87, May 2015.


however, the trend of the ventilation flux is reversed. The results also indicate that the effective source intensity, which is introduced to quantify the amount of traffic pollutant entering buildings through a unit ventilation area, decreases as the WOP increases. When the WOP reaches 10%, the averaged effective intensity is reduced by 30% compared to the reference case in which all windows are closed. This means that if a naturally ventilated room in the downstream building has a fixed ventilated area over different seasons, the room will take in more pollutants from outdoors in winter than in other seasons. Figure 6.1 illustrates the effect of window opening percentage (WOP) on indoor air quality. Another study using FlowVent® shows the effect of window opening on venting the outdoor pollutants, as shown in Figure 6.2, while Figure 6.3 displays air pollution effects between buildings separated by a street.

Figure 6.1 Impact of Window Opening Percentage (WOP) on indoor air quality

HVAC & Environmental Issues

Heating, Ventilating and Air Conditioning (HVAC) is a huge field. HVAC systems range from the simplest hand-stoked stove, used for comfort heating, to the extremely reliable total air-conditioning systems found in submarines and space shuttles141. Cooling equipment varies from the small domestic unit to refrigeration machines 10,000 times that size, used in industrial processes. Depending on the complexity of the requirements, the HVAC designer must consider many more issues than simply keeping temperatures comfortable. This chapter will introduce the fundamental concepts that are used by designers to make decisions about system design, operation, and maintenance. The title "HVAC" thus captures the development of the industry. The term "air conditioning" has gradually changed, from meaning just cooling, to the total control of:

• Temperature
• Moisture in the air (humidity)
• Supply of outside air for ventilation
• Filtration of airborne particles
• Air movement in the occupied space

141 Robert McDowall, P. Eng., "Fundamentals of HVAC Systems", Butterworth-Heinemann publications, ISBN-10: 0-12-372497-X, 2006.


We will use the term "air conditioning" to include all of these issues and continue to use "HVAC" where only some of the elements of full air conditioning are being controlled. The textbook Principles of Heating, Ventilating, and Air Conditioning142 starts with a concise and comprehensive history of the HVAC industry. HVAC evolved based on:

• Technological discoveries, such as refrigeration, that were quickly adopted for food storage.
• Economic pressures, such as the reduction in ventilation rates after the 1973 energy crisis.
• Computerization and networking, used for sophisticated control of large complex systems serving numerous buildings.
• Medical discoveries, such as the effects of second-hand smoke on people, which influenced ventilation methods.

Figure 6.2 Effect of window opening on venting outdoor pollutants

Figure 6.3 Pollution between two buildings separated by a street

Introduction to Air-Conditioning Processes
As mentioned earlier, the term "air conditioning," when properly used, now means the total control of temperature, moisture in the air (humidity), supply of outside air for ventilation, filtration of airborne particles, and air movement in the occupied space. There are seven main processes required to achieve full air conditioning; they are listed and explained below:

1. Heating - the process of adding thermal energy (heat) to the conditioned space for the purpose of raising or maintaining the temperature of the space.
2. Cooling - the process of removing thermal energy (heat) from the conditioned space for the purpose of lowering or maintaining the temperature of the space.

142 Sauer, Harry J. Jr., Ronald H. Howell, William J. Coad, 2001, "Principles of Heating, Ventilating, and Air Conditioning", Atlanta: ASHRAE.


3. Humidifying - the process of adding water vapor (moisture) to the air in the conditioned space for the purpose of raising or maintaining the moisture content of the air.
4. Dehumidifying - the process of removing water vapor (moisture) from the air in the conditioned space for the purpose of lowering or maintaining the moisture content of the air.
5. Cleaning - the process of removing particulates (dust, etc.) and biological contaminants (insects, pollen, etc.) from the air delivered to the conditioned space for the purpose of improving or maintaining the air quality.
6. Ventilating - the process of exchanging air between the outdoors and the conditioned space for the purpose of diluting the gaseous contaminants in the air and improving or maintaining air quality, composition, and freshness. Ventilation can be achieved either through natural ventilation or mechanical ventilation. Natural ventilation is driven by natural draft, as when you open a window. Mechanical ventilation can be achieved by using fans to draw air in from outside or by fans that exhaust air from the space to the outside.
7. Air Movement - the process of circulating and mixing air through conditioned spaces in the building for the purpose of achieving the proper ventilation and facilitating the thermal energy transfer.

The Role of CFD in HVAC System Optimization
Computational fluid dynamics (CFD) allows engineers to visualize flow velocity, density, thermal impact, and chemical concentrations for any region where the flow occurs, enabling engineers to analyze problem areas and suggest the best solutions. While CFD is used across the construction industry for analysis and design optimization of HVAC systems, some organizations and individuals have been slow to fully utilize it within their practices, citing restrictions such as cost, unreliability, and inaccessibility. CFD is used extensively when designing HVAC systems for non-standard applications, e.g., stadiums, large atriums, concert halls, natural ventilation systems, and smoke ventilation, and most of these systems could not be accurately designed without using CFD143.

6.2.2.1 Why Use CFD Analysis in HVAC Design
Engineers designing HVAC systems face the challenge of meeting aggressive sustainability and energy-efficiency targets while delivering comfortable environments at a reasonable cost144. Traditional design methods involve the use of hand calculations requiring many simplifying assumptions, which limit the accuracy of the calculations. Incorporating CFD simulation into the design process offers a level of reassurance, allowing a complex design to be tested as a computer model before any construction cost is incurred. Design certainty can be established as scenarios can be accurately simulated, with the calculated results graphically displayed to provide an "easy to relate to" representation. More and more, engineers are moving to CFD to compute airflow patterns and space temperatures based on complete 3D geometries with fewer assumptions, resulting in a greater level of accuracy.

Performance Prediction - One of the most notable advantages of using CFD in HVAC design is the ability to simulate fluid flows and analyze HVAC performance without actually installing the HVAC system or even building a prototype. This allows significant problems, and ultimately solutions, to be identified and devised to enhance a building's overall HVAC performance.
Provides Key HVAC Design Parameter Information - Due to key advances in HVAC/IAQ technology, broader and more detailed information about the flow within an occupied zone is required, and the CFD technique satisfies this requirement better than any other method (e.g. experimental or theoretical methods).

143 Envenio Blog, 2016.
144 See previous.


Using CFD for Validation/Optimization of HVAC Design Parameters - An HVAC system and its finer details, such as the location and number of diffusers and exhausts and the temperature and flow rate of the supplied air, can be optimized and validated for an occupied structure and for increased occupant comfort.

Modification of Malfunctioning HVAC Systems - Design modifications can be suggested; these modifications can then be simulated, and any malfunctioning of the HVAC system can be mitigated for improved performance and better HVAC within a building.

Examples of HVAC CFD Analysis in Practice
• Industrial ventilation design
• Swimming pool ventilation
• General office/room simulations
• Fume hood design
• Effective smoke evacuation in smoking lounges
• Fire simulations for warehouses
• Thermal assessment of data centers and server rooms
• Smoke and fire propagation simulations and implementation of fire safety in occupant structures

Case Study 1 - Aircraft Hangar Fire & Smoke Model
Aircraft hangars, by their very nature, pose a unique challenge for fire safety engineers. Large, open-floored areas with high roof decks house aircraft and contents worth millions of dollars. In addition to the large amounts of jet fuel, a number of the maintenance activities that take place within hangars provide a host of ignition sources. Large aircraft wings, fuselages, and scaffolding also have the potential to restrict fire detection, suppression, and the flow of smoke, presenting a potentially lethal cocktail. For fire safety design to be effective, a number of issues must first be considered. These include the fire source, heat transfer, fire detection and alarm, human behavior, smoke movement, toxicity, and pollution. CFD modelling is proving highly effective in this area, solving the fundamental equations of fluid flow and heat transfer commonly associated with fire. The result is that predictions can be made as to how smoke and heat will move throughout the hangar. Being able to predict how fire, smoke, and heat will spread inside an individual hangar has life-saving consequences, enabling the most effective fire-safety procedures to be implemented and risk assessment to be more accurate. Ventilation design depends on several factors such as the placement of the diffusers, the diffuser geometry, and the placement of the exit vents. Along with this, fluid flow conditions such as the velocity or temperature of the inlet flow determine the air flow pattern inside the hangar. We performed a case study for the HVAC design of an aircraft hangar. The aircraft hangar has twelve inlets, shown in red, and three outlet ducts, shown in blue, on the roof. All other boundaries are no-slip walls. The modeled fluid is air, treated as an ideal gas with temperature-dependent variable density. Air enters at 6 m/s with a fixed temperature of 50 °C. The aircraft surfaces have a heat transfer coefficient of 15 W/m²·K and a reference temperature of 30 °C. Hangar walls have a heat transfer coefficient of 10 W/m²·K and a reference temperature of -3 °C. A subdomain located aft of the starboard elevator, shown as a small square in the figure, is the location of a simulated fire; in this subdomain a smoke source is active. Smoke is supplied at a rate of 1 kg/s. This source creates a three-dimensional smoke plume that billows upward under buoyant forces and advects outward with the flow created by the ventilation system.
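For readers who want to see how the stated operating conditions translate into a solver setup, the snippet below simply collects them as plain data and evaluates the convective wall flux implied by an HTC boundary condition. The dictionary layout and the example surface temperature are illustrative assumptions and are not tied to any particular CFD package.

```python
# Illustrative summary of the hangar case boundary conditions described above.
hangar_case = {
    "inlets": {"count": 12, "velocity_m_s": 6.0, "temperature_C": 50.0},
    "outlets": {"count": 3},
    "aircraft_surfaces": {"htc_W_m2K": 15.0, "reference_temperature_C": 30.0},
    "hangar_walls": {"htc_W_m2K": 10.0, "reference_temperature_C": -3.0},
    "smoke_source": {"mass_rate_kg_s": 1.0, "location": "aft of starboard elevator"},
    "fluid": {"name": "air", "model": "ideal gas, temperature-dependent density"},
}

def wall_heat_flux(htc, T_wall, T_ref):
    """Convective wall heat flux q = h * (T_wall - T_ref) implied by an HTC boundary condition."""
    return htc * (T_wall - T_ref)

# Example: flux from an assumed 40 C aircraft skin temperature (hypothetical value)
q_aircraft = wall_heat_flux(15.0, 40.0, 30.0)   # W/m^2
```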


6.2.3.1 Results
Figure 6.4 shows the temperature distribution on a vertical slice that transects the aircraft fuselage. Velocity magnitude distributions show the speed of the air flow on a horizontal slice halfway up the aircraft fuselage and on a vertical slice that transects the fuselage. Scalar (smoke) concentration is shown on a horizontal slice halfway up the aircraft fuselage, and two images show the vertical structure and internal concentration distribution of the smoke plume at the end of the simulation.

Case Study 2 - CFD Modeling Approach for HVAC Systems Analysis
An HVAC system for a simplified one-room building was modelled and simulated by [Mahu et al.]145, considering both the internal and the external conditions. The numerical model was designed to concurrently take into account:

Figure 6.4 Study for HVAC design of an aircraft hangar

(1) the internal buoyancy-driven convective heat transport, (2) the conductive heat flow through the solid walls, (3) the external convective cooling induced by the lower-temperature wind, and (4) the internal and external radiative heat transfer. In order to avoid the approximation introduced by specifying an average value for the external heat transfer coefficient (HTC), the external boundary of the numerical model was modified to include a significant external volume around the building. Thus, the flow around the building was directly simulated and the external and internal HTCs were computed on the fly. The coupled fluid flow and thermal transfer simulation was performed to evaluate the internal environment comfort level.

6.2.4.1 Modeling and Simulation Approach
In order to increase the level of solution accuracy, a coupled fluid flow and thermal transfer solution was preferred. This implies that the numerical model must be able to simulate not only the internal and external heat transfer, but also the internal and external flow, at the same time. A combined approach using ANSYS Airpak 3.0 for numerical model preparation and final post-processing and ANSYS Fluent 12.0 for solution calculation and initial post-processing proved to be optimal. The predefined geometry construction and meshing features of the Airpak software greatly facilitated the pre-processing step, while the parallel processing capabilities of the Fluent software offered a very short solution turnaround time. A fully structured, multi-block discretization was created using the semi-automated tools in Airpak. The unsteady Reynolds-Averaged Navier-Stokes (RANS) model was selected, with RNG k-ε turbulence modeling. No solar heat input was considered. All material data were taken from the available standard Airpak material libraries. The RNG k-ε turbulence model is similar to the

145 R. Mahu, F. Popescu and I.V. Ion, "CFD Modeling Approach for HVAC Systems Analysis", Chem. Bull. "POLITEHNICA" Univ. (Timisoara), Volume 57(71), 2, 2012.


standard k-ε model, but uses an improved derivation based on a statistical technique, called “renormalization group theory”. Several terms in the k and ε transport equations have been modified, while the model constants, empirical for the standard k-ε model, have been analytically derived.
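For context, both the standard and RNG variants close the RANS equations through an eddy viscosity of the same form; the constants quoted below are the commonly published values and are given here for orientation only, not taken from the cited study:

$$\nu_t = C_\mu \frac{k^2}{\varepsilon}, \qquad C_\mu \approx 0.09 \ \text{(standard, empirical)}, \qquad C_\mu \approx 0.0845 \ \text{(RNG, analytically derived)}$$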

Figure 6.5 Building schematic with internal configuration

6.2.4.2 Results and Discussion
The results indicate that the flow field is highly non-uniform both inside and outside the building. A complex pattern of airflow is present on the inside, mainly driven by the 3000 W heat source on the western wall. The contribution of the other heat sources (computer and lamps) is nevertheless significant. On the outside, large recirculation regions form on all sides except for the northern face, which is typical behavior for a bluff-body flow, including the unstable, vortex-shedding wake. This suggests a highly variable (in space and time) HTC distribution on the external surfaces. Although not verified with the current model, it is most probable that this HTC distribution depends strongly on the wind direction, and a full wind spectrum should be simulated and the results recorded in the case of a real building design process. The wind-chill effect is felt significantly even on the internal surfaces of the building, most visibly on the less well insulated parts, for instance the door and window surfaces, which suggests that the heat-loss contribution of these areas is very important and should be carefully taken into consideration during the design stage. The temperature field distribution and the values recorded around the human model can be considered acceptable; therefore, under the simulated conditions, the thermally regulating internal heat source seems to be adequate. Figure 6.6 demonstrates some of the


available post-processing capabilities: data contours on planes or slices with various orientations, external or internal path lines colored by any given variable, data contours on domain boundaries, and so on.

Figure 6.6 Post-processing of results: contours of velocity magnitude on horizontal and vertical planes, temperature contours on the internal surfaces, and streamlines inside the room colored by local temperature


7 CFD Applications in Other Areas
Recently CFD has found very wide application in different areas of science and engineering; some examples are146:

• Aerodynamics of aircraft and vehicles - lift and drag
• Hydrodynamics of ships
• Power plants - combustion in internal combustion engines (ICE) and gas turbines
• Turbomachinery - flows inside rotating passages, diffusers, etc.
• Electrical and electronics engineering - cooling of equipment including microcircuits
• Chemical process engineering - mixing and separation, and polymer molding
• Marine engineering - loads on off-shore structures
• Environmental engineering - distribution of pollutants and effluents
• Hydrology and oceanography - flows in rivers, estuaries, and oceans
• Meteorology - weather prediction
• Biomedical engineering - blood flow through arteries and veins
• Food processing
• External and internal environment of buildings: wind loading, ventilation analysis, and heating/cooling load calculations

Food Processing
CFD applications in the food industry may assist in a better understanding of the complex physical mechanisms involved. [Scott]147 and [Quarini]148 have reviewed the general application of CFD to the food processing industry. Moreover, other literature is also available on specific CFD application areas such as clean-room design, refrigerated transport, static mixers, and pipe flow. Since the CFD technique can be of great benefit to the food processing industry, rapid development has taken place in the past few years. CFD, as a research tool for enhancing the design process and understanding the basic physical nature of fluid dynamics, can provide benefits to the food processing industry in many areas, such as drying, sterilization, mixing, refrigeration, crystallization, pasteurization, and other application areas149.

Drying
Drying is a common food manufacturing process. The drying rate is a strong function of air flow or air velocity. Therefore, it is of great importance to know the air flow and velocity in the drying chamber, and thus the areas of adequate air velocity for proper drying. However, air flow and air velocity are difficult to measure during operation because several sensors would need to be placed at various air flow directions and locations. Since there are difficulties in modelling the complex phenomena, especially the gas turbulence, CFD is a powerful tool to aid the prediction of the drying process. CFD has been used to predict the air flow and velocity during drying. Drying tests of several fruits were performed and the results showed that the degree of fruit dryness depended on its position within the drier. Determination of pressure profiles and air velocities by CFD showed that

146 Versteeg, H., "An Introduction to Computational Fluid Dynamics", Pearson Publications, ISBN 978-81-3172048-6, (2009).
147 Scott GM (1977), "Simulation of the flow of non-Newtonian foods using computational fluid dynamics", Campden & Chorleywood Food Research Association R & D Report No. 34, UK.
148 Quarini J (1995), "Applications of computational fluid dynamics in food and beverage production", Food Sci Technol Today 9: 234-237.
149 Bin Xia, Da-Wen Sun, "Applications of computational fluid dynamics (CFD) in the food industry: a review", Computers and Electronics in Agriculture 34 (2002) 5–24.


the main cause of the variations in drying rates and moisture contents was the lack of spatial homogeneity of the air velocities within the drier. With the aid of CFD, researchers studied velocity fields in a modern sausage drier in order to provide information on the air circulation inside the drier, which showed that CFD was able to predict the effects of filling level on air-flow patterns and also to identify measurement errors in areas where the main air flow direction was horizontal150. However, the quantitative comparison between the simulated and measured air velocities showed wide discrepancy, with mean absolute differences of about 0.6 m/s. Although the flow pattern and air velocity in the drier can be predicted using CFD modelling, how to control the drying process and reduce the energy cost is still a research topic for CFD modelling. Meanwhile, more attention should be paid to assumptions such as spatial homogeneity, because such assumptions can lead to inaccuracy in the prediction. CFD has also been used to investigate the performance and design of spray dryers in the food industry. Spray dryers are used to produce products such as milk and coffee powder, as well as detergents. However, the design of spray dryers for the food industry is difficult because the performance of spray dryers is heavily influenced by the complexity of the air and spray flow patterns inside the dryers. Therefore, there is considerable scope for the application of CFD simulation, including the optimum design of spray dryers and solutions for operational problems such as wall deposition. In the past several years, studies such as modelling and measuring the air flow pattern in a co-current pilot plant spray dryer (Kieviet et al., 1997) and analyzing the effects of air inlet geometry and spray cone angle on the wall deposition rate in spray dryers have been performed. All these studies show that there is a large scope for using CFD for other purposes. For example, CFD can be used to simulate the air flow in a spray dryer in two dimensions and to calculate the trajectories and the course of the drying process of the atomized particles. Straatsma151 developed a drying model utilizing a turbulence model to calculate the gas flow field and showed that the drying model was an effective tool for indicating how to adapt the modelling of industrial dryers to obtain better product quality or to optimize the drying performance of the unit. However, as the applications and specifications of dryers become more and more complex, so does the need for improved test work in pilot plants, and CFD simulations become more important in providing quick and valuable information.

Sterilization
It is known that consumer demands for food products focus on safety, product quality, and cost152. Therefore, it is of great necessity to enhance quality and assure the safety of the food supply. Sterilization is an important technique for food storage and preservation. CFD can be used to study both the temperature distribution and the flow pattern of food during sterilization, so as to optimize the quality of food products. Thermal processing remains the most significant sterilization technique; it results in microbial inactivation but, at the same time, in quality loss and flavor development. Excessive heating will affect food quality and its nutritive properties. With the application of CFD, there have been a number of studies to optimize the thermal sterilization of foods. These studies have led to substantial improvements in the optimal control of the process and the retention of the nutritional and sensory quality of the food. Other researchers carried out a series of studies on canned food sterilization with CFD simulation. The work varied from simulating the changes of bacteria diffusion and their transient spatial distribution during the sterilization process, to simulating natural convection heating within a can of liquid food during sterilization. It is only in

150 See above.
151 Straatsma, J., Houwelingen, V.G., Steenbergen, A.E., Jong, P.D., "Spray drying of food products: Simulation model", Journal of Food Engineering 42 (2), 67–72.
152 Bin Xia, Da-Wen Sun, "Applications of computational fluid dynamics (CFD) in the food industry: a review", Computers and Electronics in Agriculture 34 (2002) 5–24.
116

recent years that food pouches have been introduced to the market and, therefore, little or no study has been carried out on the sterilization of food in pouches. A CFD code was used to simulate the transient temperature, the velocity profiles, and the shape of the slowest heating zone during sterilization of carrot soup in pouches. The modelling of a continuous sterilization process to optimize the quality of safe food has also been developed, and the results showed that CFD modelling can be of significant help for liquid food sterilization. However, all of these investigations of CFD application in sterilization concern thermal sterilization in the limited area of liquid foods. There still remain many challenges in the area of sterilization with the application of CFD: for instance, ultraviolet, visible and infrared light surface sterilization, plasma/corona sterilization, electron and X-ray sterilization, nascent oxygen/ozone sterilization of fruits and vegetables, and pressure sterilization of fresh fruit juices and cooked ham. The application of CFD in these fields of food sterilization is still to be developed in the future. Moreover, assumptions are normally made to simplify CFD modelling. For example, specific heat, thermal conductivity, and the volume expansion coefficient were assumed to be constant in the study by Abdul Ghania et al. (1999a), although all these parameters are temperature dependent. More studies should be carried out to minimize these assumptions and thus to improve the accuracy of CFD prediction. Another area for the application of CFD is the real-time control of sterilization. Effective real-time monitoring of sterilization will improve the quality and safety of foods. Above all, the ultimate objective is to optimize the sterilization process of the food and to obtain food with excellent quality and safety. With the aid of CFD, the sterilization process can be improved.

Mixing
In the food processing industry, mixing is one of the most common operations. Mixing applications involve gases, liquids, and solids, and the mixing of fluids is one of the most important unit operations for the food processing industry. However, mixing is a complicated process with regard to the multiphase turbulence during mixing and the design of the mixer. CFD is a powerful tool for the modelling of mixing processes. It provides a natural method to link food process and fluid flow information. During mixing, a common method of enhancing the process is to use some kind of stirrer or paddle. CFD codes have been applied to optimize the mixing process so as to minimize energy input and shorten the processing time. Therefore, research has been carried out on the distribution of energy in the mixing vessel and on the effects on mixing quality when the stirrer is in different positions. Such prediction of the mixing process within these units was impossible in the past. Recently, CFD modelling of mixing in stirred tanks has been carried out by [Sahu]153, with several important points about impeller-vessel geometry, energy balance, and the linkage between the flow field and the design objective being addressed. Although no experiments were carried out in the study, the predicted values of mixing time were compared with published experimental data and the agreement was within 5–10%. This study will benefit the design of stirred tanks, and some technical problems regarding impeller types, mixing time, and equipment size can be avoided.
The design of mixing devices is an important topic in analyzing the mixing process. Therefore, some research has focused on the application of CFD to the design of mixing devices, for instance shallow bubble columns. The results of these studies provide benefits including easy measurement of the drop size distribution, the velocities of the phases and the degree of mixing, and an accurate description of the turbulence, swirling, and vortices generated in the mixer. Thus, all these developments of CFD applications to mixing in the food processing industry will lead to more accurate monitoring, control, and optimization of the mixing process. In the meantime, they will form a good basis for mixing process improvement.

153 Sahu, A.K., Kumar, P., Patwardhan, A.W., Joshi, J.B., "CFD modelling and mixing in stirred tanks", Chemical Engineering Science 54 (13–14), 2285–2293, 1999.


Refrigeration
The consumption of frozen foods has increased continually in recent years because frozen foods have demonstrated a good food quality and safety record. Refrigeration can slow down bacterial growth and preserve food. Therefore, researchers have recently applied CFD to the modelling of heat and mass transfer in foods during refrigeration (chilling and freezing). They have developed models of air-blast and vacuum cooling, chilling, the cold chain, cold stores, refrigerated rooms, and refrigerated display cabinets. CFD simulation of heat and moisture transfer for predicting the cooling rate and weight loss of cooked ham during an air-blast chilling process has been investigated. Both experimental and predicted results showed that the core temperature of the cooked ham was cooled from 74.4 to 4 °C within approximately 530 min. The experimental accumulative weight loss was 4.25%, while the simulated results were 4.07 and 4.22%, obtained from the standard k–ε model and the LRN k–ε model, respectively. At the same time, the effect of fluctuations in the inlet airflow temperature was studied, indicating that setting the boundary condition of the airflow temperature is an important factor affecting the prediction accuracy. If a constant temperature was assumed for the inlet air, the weight loss (4.37%) was over-predicted. Furthermore, the effects of different k–ε models and thermocouple positions on the prediction accuracy of the CFD modelling of the air-blast chilling process were also analyzed. Some researchers developed a two-dimensional simulation model for the airflow in two industrial meat chillers. Recently, the temperature increase of frozen food packaged in pallets in the distribution chain was investigated by means of CFD modelling. Good agreement was found between the experimental and modelling results, with the differences normally within 10%. The study showed that a controlled temperature throughout the cold chain is necessary to ensure high food quality with a long storage duration. Although the modelling of air flow and temperature distribution has been well developed, models for phase transitions, such as condensation and evaporation, are not yet available.

Crystallization
Crystallization is one of the oldest unit operations in the chemical and food industry, but the design and operation of crystallization processes still pose many problems. Until recently, there have been few tools capable of providing the required capabilities. This is because modelling of crystallization processes poses a number of challenges. The key challenge is representing the inherent physical and chemical complexity of crystallization phenomena mathematically and validating the resulting mathematical model against experimental data. CFD helps in the modelling of the crystallization process and the design of crystallizers154.

Pasteurization
Pasteurization is a vital unit operation which is used to inactivate the spoilage organisms and enzymes present in milk. Similarly, a CFD analysis has been applied to the thermal pasteurization of intact eggs. Calculated temperature profiles were found to be in good agreement with experimentally observed data for eggs of different sizes. A generally accepted kinetic inactivation model for Salmonella enteritidis was incorporated in the CFD analysis and provided a basis for process assessment. Minimum process times and temperatures to provide equivalent pasteurization effectiveness at 5 log reductions of the target microorganism were obtained on a theoretical basis.
Combining a CFD analysis with inactivation kinetics proved to be a very useful approach for establishing process conditions leading to consumer-safe eggs. In-package pasteurization for beer microbiological stabilization has also been studied. A heating process was simulated at 60 °C up to 15 PUs (a conventional beer process, in which 1 Pasteurization Unit (PU) is equivalent to 1 minute at 60 °C). The temperature profile and convection current velocity along the process and the variation of the PUs were evaluated

154 Kaushal and Sharma, "Concept of Computational Fluid Dynamics (CFD) and its Applications in Food Processing Equipment Design", J Food Process Technol 2012, 3:1.


in relation to time considering the cans in the conventional, inverted, and horizontal positions. The package position did not result in process improvement.
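As a hedged aside, pasteurization units of the kind quoted above are commonly accumulated with a lethality integral of the following form; the z-value of 7 °C is a typical published figure for beer and is an assumption here, not a number taken from the study.

```python
import numpy as np

def pasteurization_units(times_min, temps_C, T_ref=60.0, z=7.0):
    """Accumulate pasteurization units: PU = integral of 10**((T - T_ref)/z) dt.

    times_min : sample times [min]; temps_C : product temperature at those times [deg C].
    One PU corresponds to 1 minute at 60 C, as stated in the text; z is an assumed value.
    """
    t = np.asarray(times_min, dtype=float)
    T = np.asarray(temps_C, dtype=float)
    lethality = 10.0 ** ((T - T_ref) / z)
    return np.trapz(lethality, t)       # integrate the lethality rate over the process time

# Example: holding the product exactly at 60 C for 15 minutes gives 15 PU
pu = pasteurization_units([0.0, 15.0], [60.0, 60.0])
```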

Heat Exchangers
A heat exchanger is a device built for efficient heat transfer from one medium to another. In order to predict and control food quality during a heating process, CFD has been used to simulate and study the flow distribution and temperature distribution of the fluid. The trend towards aseptic processing, combined with the aim of minimizing cooked flavors in heat-processed products, is leading heat exchangers to be constantly redesigned and improved. In this case, CFD can be used to optimize such redesign of heat exchangers. Traditionally, CFD analysis has been applied to simulate the flow of a fluid around obstacles and through hollow areas in order to control temperatures, reduce resistance to flow, and/or optimize phenomena such as lift. [Khudheyer and Mahmoud]155 conducted three-dimensional CFD simulations to investigate the heat transfer and fluid flow characteristics of a two-row plain fin-and-tube heat exchanger using OpenFOAM, an open-source CFD code. Heat transfer and pressure drop characteristics of the heat exchanger were investigated for Reynolds numbers ranging from 330 to 7000. The most accurate simulations for heat transfer in laminar flow are found using the laminar flow model, while heat transfer in transitional flow is best represented with the SST k-omega turbulence model, and heat transfer in turbulent flow is more accurately simulated with the k-epsilon turbulence model. Reasonable agreement was found between the simulations and the experimental data, and the open-source software proved sufficient for simulating the flow fields in tube-fin heat exchangers.
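The model-selection trend reported above can be summarized as a small helper function; the Reynolds-number cutoffs used here are purely illustrative placeholders, since the study only reports the overall range of 330 to 7000.

```python
def select_turbulence_model(Re, laminar_limit=1000.0, turbulent_limit=4000.0):
    """Pick a turbulence treatment following the trend reported for the fin-and-tube study.

    The cutoff values are hypothetical placeholders, not values from the cited paper.
    """
    if Re < laminar_limit:
        return "laminar"        # laminar model most accurate at low Re
    elif Re < turbulent_limit:
        return "SST k-omega"    # transitional flow
    else:
        return "k-epsilon"      # fully turbulent flow
```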

CFD in the Semiconductor Industry
As with most technologies, one only learns the full range of applications once the technology has been invented156. Today CFD is being used to help with design in every area where fluids are involved, and it has found applications in the semiconductor industry as well. CFD solutions can help immensely in reducing the number of experiments required to design various chip-manufacturing equipment. After validation against experiments, finer details can be obtained more easily from CFD than from experiment, e.g., the temperature distribution over a surface, the deposition rate, or the rate of desorption. Various semiconductor companies have started using CFD calculations to help their design engineers, but the technique still has a long way to go to gain everyone's confidence in its results. CFD could be used to model the thermal system at the board level as well as within a semiconductor chip, so that efficient heat-dissipation mechanisms and sufficient cooling systems could be designed around these systems. CFD could hold interesting possibilities given that we are now looking at three-dimensional (3D) transistor dies as well as multi-die two-dimensional (2D) packages. Heat dissipation is critical for the long-term reliability of semiconductor devices.

Brief Description of Semiconductor Devices
Semiconductor devices are electronic components that exploit the electronic properties of semiconductor materials, principally silicon, germanium, and gallium arsenide, as well as organic semiconductors. Semiconductor devices have replaced thermionic devices (vacuum tubes) in most applications. They use electronic conduction in the solid state as opposed to the gaseous state or thermionic emission in a high vacuum. In layman's terms, a semiconductor is a class of material that, besides conducting current, can also behave as an insulator. As evident from the diagram (see Figure 7.1), the energy band gap between the valence band and the conduction band is:

155 Khudheyer AF, Mahmoud Sh (2011) “Numerical analysis of fin-tube plate heat exchanger by using CFD technique”, ARPN J Engineering and Applied Sciences 6: 1-7.
156 CFD online.


• large in the case of an insulator,
• overlapping in the case of a metal,
• moderate in the case of a semiconductor.

Figure 7.1 Illustrates the various classes of conductors

What are they, you ask?
1. Valence Band - the band in which electrons reside.
2. Conduction Band - the band to which electrons jump and conduct electricity.
3. Forbidden Band (the energy gap) - the band which does nothing!
For an electron to jump to the conduction band, it requires energy. Such an amount of energy is almost impossible to provide in an insulator but easily achievable in a metal. In a semiconductor the energy can be tuned so as to make the material behave like a metal or an insulator. The energy provided must be greater than the band-gap energy (>1 eV). This feature enables semiconductors to be used as switches: to switch a circuit ON, you just need to provide energy greater than about 1 eV to the semiconductor device. Figure 7.2 shows a modern semiconductor device.

Figure 7.2 Modern Semiconductor

Thermal Management in Semiconductors
There are at least ten good reasons to include thermal measurements as a routine step in any electronic component or system design process157. Amid all the promotion of solid-state superlatives ranging from data rate to feature size to LED light output, one characteristic is never touted: junction temperature. That's because Junction Temperature (JT) is an undesired but unavoidable side effect of high currents and/or switching speeds. A p-n junction, whether it is one of millions on a CPU chip or the only one within a power LED, generates heat. In the past two decades the industry has seen heat dissipation increase by orders of magnitude. Faster is better, but faster is also hotter. This trend is not without consequences. A 10° increase in JT can cause a 50% reduction in a semiconductor device's life expectancy. In LEDs, both brightness and color can suffer as JT increases. And of course

157 From Mentor CFD Blogs.


the twin issues of safety and cooling can impact the design of an entire system, not just the semiconductor device producing the heat. All these facts point toward the need for a thorough grasp of thermal behavior at the chip level, and beyond. True understanding comes with physical measurements performed on actual devices. This is especially true in the world of semiconductors: heat dissipation in semiconductor packages is one of the limiting factors in miniaturization. One of the biggest concerns of circuit designers is the power that keeps increasing with growing bandwidths. As a result, the chip temperature increases. This change first modifies and then destroys the operation of the circuit if the heat is not correctly led out of the device. Being able to understand the true thermal characteristics of a chip that will go inside an enclosure jam-packed with other heat-generating equipment can therefore be very helpful (see Figure 7.3). While most manufacturers publish thermal metrics for their chips, unfortunately not every manufacturer knows how to conduct an appropriate thermal characterization of their devices, so you can't always rely on published metrics158.

Figure 7.3 Thermal Management of Semiconductor (courtesy of Mentor CFD)

Can You Really Fry an Egg on a CPU?
An interesting question arises: can you really fry an egg on a CPU? Believe it or not, somebody has already tried159. Solving complex thermal models with computational fluid dynamics (CFD) requires a lot of processing power, and a central processing unit (CPU) under full load generates a fair amount of heat. But can you cook an egg on it? This article describes the model, the simulations, and the ultimate conclusion. Before you throw away your conventional heatsink and fan in favor of a multifunctional omelet, we investigate what CFD predicts about the fate of your PC if you do so (see Figure 7.4).

Figure 7.4 An Example of an Egg Frying on a CPU

Unfortunately, the CPU junction temperature exceeds 90°C within 6 seconds, at which point the CPU clock would throttle down to reduce the thermal power and prevent damage to the system; less than

158 From Mentor CFD Blogs.
159 James Forsyth, System-Level Design, Semiconductor Engineering.


ideal for a cooling solution. The egg would also burn and catch fire. The central location of the CPU on the board and the large obstacles to air flow in the neighboring memory DIMMs and I/O ports mean that only limited cold air can passively flow over the hot egg by natural convection. The passive cooling of the egg cannot match the forced convection of the stock cooler. An egg-based cooling solution would only keep the CPU below the maximum 90°C if the CPU performance were throttled down, so there are only possible applications in lower-power environments with plenty of ventilation. Given the requirement of frequently swapping out the egg, we can't see this catching on. If the aim is to cook eggs, though, CPUs certainly produce enough heat to do so; with thermal throttling, the processor acts as a thermostatically controlled surface at around 90°C, sufficient to cook on. If you value your computer, maybe consider buying a frying pan instead.

Magneto-Hydro-Dynamics (MHD)
Magneto-hydro-dynamics (MHD; also magneto-fluid dynamics or hydro-magnetics) is the study of the magnetic properties of electrically conducting fluids. Examples of such magneto-fluids include plasmas, liquid metals, salt water and electrolytes. The word "magneto-hydro-dynamics" is derived from magneto- meaning magnetic field, hydro- meaning water, and -dynamics meaning movement160. In a nutshell, MHD is the study of electrically conducting fluids, combining the principles of fluid dynamics and electromagnetism. According to [Battista]161, MHD is traditionally studied as a continuum theory; that is to say, attempts at tracking the discrete particles in such flows are not computationally realistic. Running "realistic simulations" would require computations with many more particles than current computers can handle. Thus, the only way to study MHD seems to be in its continuum form, leading to a description based on the Navier-Stokes fluid equations162.
MHD Equations
The ideal MHD equations consist of the continuity equation, the Cauchy momentum equation, Ampere's law neglecting displacement current, and a temperature evolution equation. As with any fluid description of a kinetic system, a closure approximation must be applied to the highest moment of the particle distribution equation. This is often accomplished by approximating the heat flux through a condition of adiabaticity or isothermality. The main quantities that characterize the electrically conducting fluid are the bulk plasma velocity field v, the current density J, the mass density ρ, and the plasma pressure p. The flowing electric charge in the plasma is the source of a magnetic field B and an electric field E. All quantities generally vary with time t as described by Eq. 7.1:

I. The continuity equation for charge conservation, where ρc = 0 because we are assuming the absence of an external charge distribution.
II. The Cauchy momentum equation, where the Lorentz force term J×B can be expanded using Ampere's law and a vector calculus identity; the first term on the right-hand side is the magnetic tension force and the second term is the magnetic pressure force.
III. The ideal Ohm's law for a plasma.
IV. Faraday's law.
V. The low-frequency Ampere's law, which neglects displacement current.
VI. The magnetic divergence constraint.
VII. The energy equation, where γ = 5/3 is the ratio of specific heats for an adiabatic equation of state. This energy equation is, of course, only applicable in the absence of shocks or heat conduction, as it assumes that the entropy of a fluid element does not change.

160 From Wikipedia, the free encyclopedia.
161 Nicholas A. Battista, “An Introduction to Magnetohydrodynamics”, Stony Brook University, December 2010.
162 See previous.


VIII. The Hartmann number (Ha), the ratio of the electromagnetic force to the viscous force, first introduced by Hartmann, where B is the magnetic field, L is the characteristic length scale, σ is the electrical conductivity, and μ is the dynamic viscosity.

\begin{aligned}
&\text{(I)}\quad \frac{\partial \rho_c}{\partial t} + \nabla\cdot\mathbf{J} = 0
\qquad
\text{(II)}\quad \rho\left(\frac{\partial \mathbf{v}}{\partial t} + \mathbf{v}\cdot\nabla\mathbf{v}\right)
= \underbrace{\mathbf{J}\times\mathbf{B}}_{\text{Lorentz force}} - \nabla p\,,\quad
\mathbf{J}\times\mathbf{B} = \frac{(\mathbf{B}\cdot\nabla)\mathbf{B}}{\mu_0} - \nabla\!\left(\frac{B^2}{2\mu_0}\right)\\
&\text{(III)}\quad \mathbf{E} + \mathbf{v}\times\mathbf{B} = 0
\qquad
\text{(IV)}\quad \frac{\partial \mathbf{B}}{\partial t} = -\nabla\times\mathbf{E}
\qquad
\text{(V)}\quad \mu_0\,\mathbf{J} = \nabla\times\mathbf{B}\\
&\text{(VI)}\quad \nabla\cdot\mathbf{B} = 0
\qquad
\text{(VII)}\quad \frac{d}{dt}\!\left(\frac{p}{\rho^{\gamma}}\right) = 0
\qquad
\text{(VIII)}\quad Ha = B\,L\sqrt{\frac{\sigma}{\mu}}
\end{aligned}

Eq. 7.1
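As a concrete illustration of definition (VIII), the short C sketch below evaluates the Hartmann number for a liquid-metal-like fluid; the property values are illustrative placeholders, not data from any particular experiment.

      #include <stdio.h>
      #include <math.h>

      /* Hartmann number Ha = B*L*sqrt(sigma/mu): ratio of electromagnetic to viscous forces */
      double hartmann(double B, double L, double sigma, double mu)
      {
          return B * L * sqrt(sigma / mu);
      }

      int main(void)
      {
          double B     = 1.0;     /* magnetic field strength, T (illustrative)  */
          double L     = 0.05;    /* characteristic length, m (illustrative)    */
          double sigma = 3.5e6;   /* electrical conductivity, S/m (illustrative)*/
          double mu    = 1.5e-3;  /* dynamic viscosity, Pa.s (illustrative)     */
          printf("Ha = %.1f\n", hartmann(B, L, sigma, mu));
          return 0;
      }

A large Ha indicates that electromagnetic (Lorentz) damping dominates viscous effects, which is the regime of interest in the duct-flow case study that follows.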

The fundamental concept behind MHD is that magnetic fields can induce currents in a moving conductive fluid, which in turn polarizes the fluid and reciprocally changes the magnetic field itself. The set of equations that describe MHD is a combination of the Navier-Stokes equations of fluid dynamics and Maxwell's equations of electromagnetism (see Eq. 7.1). These differential equations must be solved simultaneously, either analytically or numerically. Figure 7.5 shows the right-hand rule for forces in MHD.

Figure 7.5 Right Hand Rule for MHD

Case Study - Dynamics of a Q2D Wake Behind a Cylinder in an MHD Environment
A confined laminar viscous flow past a two-dimensional bluff body in the presence of a strong uniform magnetic field is investigated by [Hamid et al.]163. The effects of Reynolds number (Re) and Hartmann number (Ha) on the dynamics of the wake are examined, with a focus on the shedding frequency and the distribution of the wake vortices. These two parameters are of primary interest as they play an important role in determining the mixing and heat transfer properties of the downstream flow. The results indicate that the imposed magnetic field significantly alters the dynamic behavior of the wake behind a cylinder. It is well known that beyond a critical Re, the flow around a circular cylinder generates a regular pattern of vortices known as the Karman vortex street. Analysis of such bluff-body wakes is typically divided into three main focus areas: the correlation between drag coefficient, base pressure and

163 A. H. A. Hamid, W. K. Hussam and G. J. Sheard, “Dynamics of a Quasi-Two-Dimensional Wake Behind a Cylinder in an MHD Duct Flow with a Strong Axial Magnetic Field”, 19th Australasian Fluid Mechanics Conference, Melbourne, Australia, 8-11 December 2014.


shedding frequency; the vortex dynamics, where the formation and re-arrangement processes are addressed; and the stability of the mean velocity profile in the wake. When a strong magnetic field is imposed on a conducting fluid, the resulting wake possesses distinct features compared to ordinary hydrodynamic flows. A typical example of such flows is found in fusion power-reactor breeding blankets, where an electrically conducting fluid flows in channels within the blankets under a strong plasma-confining magnetic field. This class of flows is known as Magneto-Hydro-Dynamic (MHD). The interaction between the induced electric currents and the applied magnetic field results in an electromagnetic Lorentz force, which in turn damps the flow and subsequently alters the formation of the vortex street.
7.4.2.1 Numerical Method and Geometry
In the current investigation a flow of electrically conducting fluid passing over a circular cylinder placed on the centerline of a duct is considered. Figure 7.6 depicts the numerical domain and the corresponding macro-element mesh. The ratio of cylinder diameter to duct width (i.e. the blockage ratio, b = d/2L) is fixed at 0.1 throughout this study. Also shown in the figure is a typical Hartmann velocity profile, characterized by a flat profile in the core with velocity U0 and high gradients in the vicinity of the lateral walls. The length scale is normalized by the half channel width, L. However, for the sake of the discussion, the Re and the geometrical lengths in what follows are presented in the cylinder diameter scale, d. The use of two different length scales in MHD cylinder wake flows is inevitable: the two-dimensional linear braking term is governed by Ha and L, whereas the Re, and thus the structure of the cylinder wake, is governed by d164. A quasi-two-dimensional (Q2D) model for MHD duct flow is employed165. Under this model, the non-dimensional magneto-hydro-dynamic equations of continuity and momentum reduce to

\nabla\cdot\mathbf{u} = 0\,,
\qquad
\frac{\partial \mathbf{u}}{\partial t} + (\mathbf{u}\cdot\nabla)\mathbf{u}
= -\nabla p + \frac{1}{Re}\nabla^{2}\mathbf{u} - \frac{2\,Ha}{Re}\,\mathbf{u}

Eq. 7.2

where u and p are the velocity and pressure fields, respectively. The governing equations are discretized using a high-order, in-house solver based on the spectral-element method.

Figure 7.6 Schematic diagram of the numerical domain

164 Frank, M., Barleon, L. and Müller, U., 2001, “Visual analysis of two-dimensional magnetohydrodynamics”, Physics of Fluids, 13, 2287.
165 Sommeria, J. and Moreau, R., 1982, “Why, how, and when, MHD turbulence becomes two-dimensional”, Journal of Fluid Mechanics, 118, 507-518.


7.4.2.2 Results and Discussion
In all simulations, two basic regions of wake vortices are apparent: a formation region in which the vorticity evolved from the cylinder boundary layers organizes into a vortex street, and a stable region in which the shed vortices convect downstream in a periodic laminar manner. This section presents the results of the shedding frequency analysis and the vortex distributions. In the current investigation, the effect of the axial magnetic field on the shedding frequency is of interest. It is to be noted that Ha = 0 corresponds to hydrodynamic flow. The dimensionless frequency is represented by the Strouhal number, St = fd/U0, where f is the shedding frequency, calculated from the fluctuating lift force imparted on the cylinder by the near-wake flow unsteadiness. The Strouhal number is dependent on both Ha and Re. In the range of Ha and Re considered here, St increases with increasing Ha at a given Re. This observation can be attributed to the fact that the imposed magnetic field tends to stretch the shear layer in the near wake, and hence mass conservation requires that the wake advection velocity, Uw, is increased. It can be seen in Figure 7.7 that a stronger magnetic field intensity produces a narrower wake, thus extending the formation region behind the cylinder before the shear layer rolls up into a vortex street. For a detailed discussion, please see [Hamid et al.]166.

Figure 7.7 Contour plots of vorticity snapshots at Re_d = 160 and at the Hartmann numbers indicated

In conclusion, the present study has investigated the characteristics of wakes behind a circular cylinder in a rectangular duct under a strong axial magnetic field using a spectral-element method. It is found that the formation of vortex shedding and the direction of the imposed magnetic field play significant roles in determining the shedding frequency. The present investigation reveals that an axial magnetic field tends to appreciably increase St, regardless of the flow Re. Furthermore, the advection speed of the wake vortices is a strong function of both Ha and Re, whereas Uw is only weakly dependent on Re for hydrodynamic flows.

166 See 125.


8 Modern Computer Architectures
Background
As we know, computers and software are among the pillars of CFD, and the next two chapters are devoted to them. A CFD analyst need not be a computer expert; nevertheless, knowing the essentials never hurts (see Figure 8.1). It is therefore wise to become familiar with modern computer architectures, as well as software optimization, as detailed in [Severance & Dowd]167. Even if you could make the computational parts of a processor infinitely fast, you would still have to load and store data and instructions to and from memory. Today's processors continue to creep ever closer to infinitely fast processing, but memory performance is increasing at a much slower rate (it will take longer for memory to become infinitely fast). Many of the interesting problems in high performance computing use a large amount of memory, and as computers get faster, the size of the problems they tend to operate on also goes up. The trouble is that when you want to solve these problems at high speed, you need a memory system that is large yet at the same time fast; a big challenge. Possible approaches include the following:

Figure 8.1 Contributions from other disciplines to CFD

• Every memory system component can be made individually fast enough to respond to every memory access request.
• Slow memory can be accessed in a round-robin fashion (hopefully) to give the effect of a faster memory system.
• The memory system design can be made wide so that each transfer contains many bytes of information.
• The system can be divided into faster and slower portions and arranged so that the fast portion is used more often than the slow one.

Again, economics is the dominant force in the computer business. A cheap, statistically optimized memory system will be a better seller than a prohibitively expensive, blazingly fast one, so the first choice is not much of a choice at all. But these approaches, used in combination, can attain a good fraction of the performance you would get if every component were fast. Chances are very good that your high performance workstation incorporates several or all of them. Once the memory system has been decided upon, there are things we can do in software to see that it is used efficiently. A compiler that has some knowledge of the way memory is arranged and the details of the caches can optimize their use to some extent. The other place for optimizations is in user applications, as we'll see later in the

167 Charles Severance, Kevin Dowd, “High Performance Computing”, Rice University, Houston, Texas, 2012.


book. A good pattern of memory access will work with, rather than against, the components of the system. Next, we discuss how the pieces of a memory system work. We look at how patterns of data and instruction access factor into your overall runtime, especially as CPU speeds increase. We also talk a bit about the performance implications of running in a virtual memory environment168.

Memory Technology
Almost all fast memories used today are semiconductor-based169. They come in two flavors: Dynamic Random Access Memory (DRAM) and Static Random Access Memory (SRAM). The term random means that you can address memory locations in any order. This distinguishes them from serial memories, where you have to step through all intervening locations to get to the particular one you are interested in; an example of a storage medium that is not random is magnetic tape. The terms dynamic and static have to do with the technology used in the design of the memory cells. DRAMs are charge-based devices, where each bit is represented by an electrical charge stored in a very small capacitor. The charge can leak away in a short amount of time, so the system has to be continually refreshed to prevent data from being lost. The act of reading a bit in DRAM also discharges the bit, requiring that it be refreshed. It's not possible to read a memory bit in DRAM while it's being refreshed. SRAM is based on gates, and each bit is stored in four to six connected transistors. SRAM memories retain their data as long as they have power, without the need for any form of data refresh. DRAM offers the best price/performance, as well as the highest density of memory cells per chip. This means lower cost, less board space, less power, and less heat. On the other hand, some applications such as cache and video memory require higher speed, to which SRAM is better suited. Currently, you can choose between SRAM and DRAM at slower speeds, down to about 50 nanoseconds (ns). SRAM has access times down to about 7 ns at higher cost, heat, power, and board space. In addition to the basic technology to store a single bit of data, memory performance is limited by the practical considerations of the on-chip wiring layout and the external pins on the chip that communicate the address and data information between the memory and the processor.
Memory Access Time
The amount of time it takes to read or write a memory location is called the memory access time. Whereas the access time says how quickly you can reference a memory location, the cycle time describes how often you can repeat references. They sound like the same thing, but they're not. For instance, if you ask for data from DRAM chips with a 50-ns access time, it may be 100 ns before you can ask for more data from the same chips. This is because the chips must internally recover from the previous access. Also, when you are retrieving data sequentially from DRAM chips, some technologies have improved performance; on these chips, data immediately following the previously accessed data may be accessed as quickly as 10 ns170.
Memory Access Patterns
The best access pattern is the most straightforward: increasing and unit sequential. For an array with a single dimension, stepping through one element at a time will accomplish this. For multiply-dimensioned arrays, access is fastest if you iterate on the array subscript offering the smallest stride or step size. In FORTRAN programs, this is the leftmost subscript; in C, it is the rightmost. The FORTRAN loop below has unit stride, and therefore will run quickly:

168 Charles Severance, Kevin Dowd, “High Performance Computing”, Rice University, Houston, Texas, 2012.
169 Magnetic core memory is still used in applications where radiation hardness (resistance to changes caused by ionizing radiation) is important.
170 Charles Severance, Kevin Dowd, “High Performance Computing”, Rice University, Houston, Texas, 2012.


      DO J = 1 , N
        DO I = 1 , N
          A(I,J) = B(I,J) + C(I,J) * D
        ENDDO
      ENDDO

In contrast, the next loop is slower because its stride is N. As N increases from one to the length of a cache line (adjusting for the length of each element), the performance worsens. Once N is longer than the length of the cache line (again adjusted for element size), the performance won't decrease any further:

      DO J = 1 , N
        DO I = 1 , N
          A(J,I) = B(J,I) + C(J,I) * D
        ENDDO
      ENDDO

Here's a unit-stride loop like the first one, but written in C:

      for (i = 0; i < n; i++)
        for (j = 0; j < n; j++)
          a[i][j] = a[i][j] + c[i][j] * d;

8.2.2.1 Loop Interchange to Ease Memory Access Patterns
Loop interchange is a good technique for lessening the impact of strided memory references. Let's revisit our FORTRAN loop with non-unit stride. The good news is that we can easily interchange the loops; each iteration is independent of every other:

      DO J = 1 , N
        DO I = 1 , N
          A(J,I) = B(J,I) + C(J,I) * D
        ENDDO
      ENDDO

After interchange, A, B, and C are referenced with the leftmost subscript varying most quickly. This modification can make an important difference in performance. We traded three N-stride memory references for unit strides:

      DO I = 1 , N
        DO J = 1 , N
          A(J,I) = B(J,I) + C(J,I) * D
        ENDDO
      ENDDO

Virtual Memory
Virtual memory decouples the addresses used by the program (virtual addresses) from the actual addresses where the data is stored in memory (physical addresses). Your program sees its address space starting at 0 and working its way up to some large number, but the actual physical addresses assigned can be very different. It gives a degree of flexibility by allowing all processes to believe they have the entire memory system to themselves. Another trait of virtual memory systems is that they divide your program's memory up into chunks called pages. Page sizes vary from 512 bytes to 1 MB or


larger, depending on the machine. Pages don't have to be allocated contiguously, though your program sees them that way. By being separated into pages, programs are easier to arrange in memory, or move portions out to disk.
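Page sizes are not something you normally choose, but you can query them. A minimal C sketch using the POSIX sysconf call (the value returned will of course differ from machine to machine):

      #include <stdio.h>
      #include <unistd.h>

      int main(void)
      {
         long pagesize = sysconf(_SC_PAGESIZE);   /* bytes per virtual-memory page */
         printf("page size: %ld bytes\n", pagesize);
         return 0;
      }

Knowing the page size matters for the blocking and out-of-core techniques listed later in this chapter, where working sets are sized to fit within a given number of pages.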

Registers
At least the top layer of the memory hierarchy, the CPU registers, operate as fast as the rest of the processor. The goal is to keep operands in the registers as much as possible. This is especially important for intermediate values used in a long computation such as:

      X = G * 2.41 + A / W - W * M

While computing the value of A divided by W, we must store the result of multiplying G by 2.41. It would be a shame to have to store this intermediate result in memory and then reload it a few instructions later. On any modern processor with moderate optimization, the intermediate result is stored in a register. Also, the value W is used in two computations, and so it can be loaded once and used twice to eliminate a wasted load. Compilers have been very good at detecting these types of optimizations and efficiently making use of the available registers since the 1970s. Adding more registers to the processor has some performance benefit, but it's not practical to add enough registers to store the entire problem data, so we must still use the slower memory technology.

Caches
Once we go beyond the registers in the memory hierarchy, we encounter caches. Caches are small amounts of SRAM that store a subset of the contents of main memory. The hope is that the cache will have the right subset of main memory at the right time. The actual cache architecture has had to change as the cycle time of the processors has improved. The processors are so fast that off-chip SRAM chips are not even fast enough. This has led to a multilevel cache approach with one, or even two, levels of cache implemented as part of the processor. Table 8.1 shows the approximate speed of accessing the memory hierarchy on a 500-MHz DEC Alpha.

Table 8.1 Memory Access Speed on a DEC Alpha
      Register      2 ns
      L1 on-chip    4 ns
      L2 on-chip    5 ns
      L3 on-chip   30 ns
      Memory      220 ns

When every reference can be found in a cache, you say that you have a 100% hit rate. Generally, a hit rate of 90% or better is considered good for a level-one (L1) cache. In a level-two (L2) cache, a hit rate above 50% is considered acceptable; below that, application performance can drop off steeply. One can characterize the average read performance of the memory hierarchy by examining the probability that a particular load will be satisfied at a particular level of the hierarchy. For example, assume a memory architecture with an L1 cache speed of 10 ns, an L2 speed of 30 ns, and a memory speed of 300 ns. If a memory reference were satisfied from L1 cache 75% of the time, L2 cache 20% of the time, and main memory 5% of the time, the average memory performance would be:

      (0.75 * 10) + (0.20 * 30) + (0.05 * 300) = 28.5 ns

You can easily see why it's important to have an L1 cache hit rate of 90% or higher. Given that a cache holds only a subset of the main memory at any time, it's important to keep an index of which areas of the main memory are currently stored in the cache. To reduce the amount of space that must be dedicated to tracking which memory areas are in cache, the cache is divided into a number of equal-sized slots known as lines. Each line contains some number of sequential main memory locations, generally four to sixteen integers or real numbers. Whereas the data within a line comes from the
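The weighted-average calculation above is easy to generalize. The short C sketch below repeats the arithmetic for an arbitrary number of levels; the hit fractions and latencies are the illustrative values from the text, not measurements of any particular machine.

      #include <stdio.h>

      /* Average access time = sum over levels of (fraction satisfied there) * (latency) */
      double average_access_time(const double frac[], const double latency_ns[], int nlevels)
      {
         double avg = 0.0;
         for (int i = 0; i < nlevels; i++)
            avg += frac[i] * latency_ns[i];
         return avg;
      }

      int main(void)
      {
         double frac[]    = {0.75, 0.20, 0.05};   /* L1, L2, main memory hit fractions */
         double latency[] = {10.0, 30.0, 300.0};  /* corresponding latencies in ns     */
         printf("average = %.1f ns\n", average_access_time(frac, latency, 3));
         return 0;
      }

Raising the L1 fraction and shrinking the main-memory fraction accordingly reduces the average access time, which is the point the text makes about hit rates.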


same part of memory, other lines can contain data that is far separated within your program, or perhaps data from somebody else's program, as in Figure 8.2 (Cache lines can come from different parts of memory). When you ask for something from memory, the computer checks to see if the data is available within one of these cache lines. If it is, the data is returned with a minimal delay. If it's not, your program may be delayed while a new line is fetched from main memory. Of course, if a new line is brought in, another has to be thrown out. If you're lucky, it won't be the one containing the

Figure 8.2 Cache Lines can come from Different Parts of Memory

data you are just about to need. On multiprocessors (several CPUs), written data must be returned to main memory so the rest of the processors can see it, or all other processors must be made aware of local cache activity. The problem can become very complex in a multiprocessor system. Caches are effective because programs often exhibit characteristics that help keep the hit rate high, in particular sequential memory references. Such an access pattern is called unit stride because the address of each successive data element is incremented by one and all the data retrieved into the cache is used. The following loop is a unit-stride loop:

      DO I = 1 , 1000000
        SUM = SUM + A(I)
      END DO

When a program accesses a large data structure using non-unit stride, performance suffers because data is loaded into cache that is not used. For example:

      DO I = 1 , 1000000 , 8
        SUM = SUM + A(I)
      END DO

This code would experience the same number of cache misses as the previous loop, and the same amount of data would be loaded into the cache. However, the program needs only one of the eight 32-bit words loaded into cache. Even though this program performs one-eighth the additions of the previous loop, its elapsed time is roughly the same as the previous loop because the memory operations dominate performance. While this example may seem a bit contrived, there are several situations in which non-unit strides occur quite often. First, when a FORTRAN two-dimensional array is stored in memory, successive elements in the first column are stored sequentially, followed by the


elements of the second column. If the array is processed with the row iteration as the inner loop, it produces a unit-stride reference pattern as follows:

      REAL*4 A(200,200)
      DO J = 1 , 200
        DO I = 1 , 200
          SUM = SUM + A(I,J)
        END DO
      END DO

Interestingly, a FORTRAN programmer would most likely write the loop (in alphabetical order) as follows, producing a non-unit stride of 800 bytes between successive load operations:

      REAL*4 A(200,200)
      DO I = 1 , 200
        DO J = 1 , 200
          SUM = SUM + A(I,J)
        END DO
      END DO

Because of this, some compilers can detect this suboptimal loop order and reverse the order of the loops to make best use of the memory system. As we will see, however, this code transformation may produce different results, and so you may have to give the compiler permission to interchange these loops in this particular example (or, after reading this book, you could just code it properly in the first place). A second common pattern arises when a program traverses a linked list, as in the following C fragment:

      while ( ptr != NULL )
        ptr = ptr->next;

The next element that is retrieved is based on the contents of the current element. This type of loop bounces all around memory in no particular pattern. This is called pointer chasing and there are no good ways to improve the performance of this code. A third pattern often found in certain types of codes is called gather (or scatter) and occurs in loops such as:

      SUM = SUM + ARR(IND(I))

where the IND array contains offsets into the ARR array. Again, like the linked list, the exact pattern of memory references is known only at runtime, when the values stored in the IND array are known. Some special-purpose systems have special hardware support to accelerate this particular operation.
Cache Organization
The process of pairing memory locations with cache lines is called mapping. Of course, given that a cache is smaller than main memory, you have to share the same cache lines for different memory locations. In caches, each cache line has a record of the memory address (called the tag) it represents and perhaps when it was last used. The tag is used to track which area of memory is stored in a particular cache line. The way memory locations (tags) are mapped to cache lines can have a beneficial effect on the way your program runs, because if two heavily used memory locations map onto the same cache line, the miss rate will be higher than you would like it to be. Caches can be organized in one of several ways: direct mapped, fully associative, and set associative.


8.4.1.1 Direct-Mapped Cache
Direct mapping, as presented in Figure 8.3, is the simplest algorithm for deciding how memory maps onto the cache. Say, for example, that your computer has a 4-KB cache. In a direct-mapped scheme, memory location 0 maps into cache location 0, as do memory locations 4K, 8K, 12K, etc. In other words, memory maps onto the cache modulo the cache size. Another way to think about it is to imagine a metal spring with a chalk line marked down the side. Every time around the spring, you encounter the chalk line at the same place modulo the circumference of the spring. If the spring is very long, the chalk line crosses many coils, the analog being a large memory with many locations mapping into the same cache line. Problems occur when alternating runtime memory references in a direct-mapped cache point to the same cache line. Each reference causes a cache miss and replaces the entry that was just loaded, causing a lot of overhead. The popular word for this is thrashing. When there is lots of thrashing, a cache can be more of a liability than an asset, because each cache miss requires that a cache line be refilled, an operation that moves more data than merely satisfying the reference directly from main memory. It is easy to construct a pathological case that causes thrashing in a 4-KB direct-mapped cache:

      REAL*4 A(1024), B(1024)
      COMMON /STUFF/ A , B
      DO I = 1 , 1024
        A(I) = A(I) * B(I)
      END DO

The arrays A and B both take up exactly 4 KB of storage, and their inclusion together in COMMON assures that the arrays start exactly 4 KB apart in memory. In a 4-KB direct-mapped cache, the same line that is used for A(1) is used for B(1), and likewise for A(2) and B(2), etc., so alternating references cause repeated cache misses. To fix it, you could either adjust the size of the array A, or put some other variables into COMMON between them. For this reason one should generally avoid array dimensions that are close to powers of two.
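To make the modulo mapping concrete, the small C sketch below computes which line of a hypothetical 4-KB direct-mapped cache a byte address falls into; the cache size, line size, and array addresses are illustrative parameters, not those of any specific processor.

      #include <stdio.h>

      #define CACHE_SIZE 4096   /* 4-KB direct-mapped cache (illustrative) */
      #define LINE_SIZE    32   /* 32-byte cache lines (illustrative)      */

      /* A direct-mapped cache selects the line purely from the address:
         line = (address modulo cache size) / line size.                  */
      unsigned long cache_line(unsigned long address)
      {
         return (address % CACHE_SIZE) / LINE_SIZE;
      }

      int main(void)
      {
         /* Two arrays placed exactly 4 KB apart, as in the COMMON block above,
            land on the same line for every pair of corresponding elements.   */
         unsigned long a0 = 0x10000;           /* start of A (illustrative)   */
         unsigned long b0 = a0 + CACHE_SIZE;   /* start of B, 4 KB later      */
         for (int i = 0; i < 4; i++)
            printf("A(%d) -> line %lu, B(%d) -> line %lu\n",
                   i + 1, cache_line(a0 + 4UL * i),
                   i + 1, cache_line(b0 + 4UL * i));
         return 0;
      }

Every pair prints the same line number, which is exactly the alternating-miss pattern the FORTRAN fragment above provokes.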

Figure 8.3 Many memory addresses map to the same cache line


8.4.1.2 Fully Associative Cache
At the other extreme from a direct-mapped cache is a fully associative cache, where any memory location can be mapped into any cache line, regardless of memory address. Fully associative caches get their name from the type of memory used to construct them: associative memory. Associative memory is like regular memory, except that each memory cell knows something about the data it contains. When the processor goes looking for a piece of data, the cache lines are asked all at once whether any of them has it. The cache line containing the data holds up its hand and says "I have it"; if none of them do, there is a cache miss. It then becomes a question of which cache line will be replaced with the new data. Rather than map memory locations to cache lines via an algorithm, like a direct-mapped cache, the memory system can ask the fully associative cache lines to choose among themselves which memory locations they will represent. Usually the least recently used line is the one that gets overwritten with new data; the assumption is that if the data hasn't been used in quite a while, it is least likely to be used in the future. Fully associative caches have superior utilization when compared to direct-mapped caches, and it's difficult to find real-world examples of programs that will cause thrashing in a fully associative cache. However, the expense of fully associative caches is very high, in terms of size, price, and speed, so the associative caches that do exist tend to be small.
8.4.1.3 Set-Associative Cache
Now imagine that you have two direct-mapped caches sitting side by side in a single cache unit, as shown in Figure 8.4. Each memory location corresponds to a particular cache line in each of the two direct-mapped caches. The one you choose to replace during a cache miss is subject to a decision about whose line was used last, the same way the decision was made in a fully associative cache, except that now there are only two choices. This is called a set-associative cache. Set-associative caches generally come in two and four separate banks of cache, called two-way and four-way set-associative caches, respectively. Of course, there are benefits and drawbacks to each type of cache. A set-associative cache is more immune to cache thrashing than a direct-mapped cache of the same size, because for each mapping of a memory address into a cache line, there are two or more choices of where it can go. The beauty of a direct-mapped cache, however, is that it's easy to implement and, if made large enough, will perform roughly as well as a set-associative design. Your machine may contain multiple caches for several different purposes. Here's a little program for causing thrashing in a 4-KB two-way set-associative cache:

      REAL*4 A(1024), B(1024), C(1024)
      COMMON /STUFF/ A, B, C
      DO I = 1 , 1024
        A(I) = A(I) * B(I) + C(I)
      END DO

Like the previous cache thrasher program, this forces repeated accesses to the same cache lines, except that now there are three variables contending for the same mapping instead of two. Again, the way to fix it would be to change the size of the arrays or insert something in between them, in COMMON. By the way, if you accidentally arranged a program to thrash like this, it would be hard for you to detect it, aside from a feeling that the program runs a little slow, because few vendors provide tools for measuring cache misses.


Figure 8.4 Two-Way Set-Associative Cache

8.4.1.4 Instruction Cache
So far we have glossed over the two kinds of information you would expect to find in a cache between main memory and the CPU: instructions and data. But if you think about it, the demand for data is separate from the demand for instructions. In superscalar processors, for example, it's possible to execute an instruction that causes a data cache miss alongside other instructions that require no data from cache at all, i.e., they operate on registers. It doesn't seem fair that a cache miss on a data reference in one instruction should keep you from fetching other instructions because the cache is tied up. Furthermore, a cache depends on locality of reference between bits of data and other bits of data, or instructions and other instructions, but what kind of interplay is there between instructions and data? It would seem possible for instructions to bump perfectly useful data from cache, or vice versa, with complete disregard for locality of reference. Many designs from the 1980s used a single cache for both instructions and data. But newer designs employ what is known as the Harvard Memory Architecture, where the demand for data is segregated from the demand for instructions. Main memory is still a single large pool, but these processors have separate data and instruction caches, possibly of different designs. By providing two independent sources for data and instructions, the aggregate rate of information coming from memory is increased, and interference between the two types of memory references is minimized. Also, instructions generally have an extremely high level of locality of reference because of the sequential nature of most programs. Because the instruction caches don't have to be particularly large to be effective, a typical architecture is to have separate L1 caches for instructions and data and to have a combined L2 cache. For example, the IBM/Motorola PowerPC 604e has separate 32-K four-way set-associative L1 caches for instruction and data and a combined L2 cache.

Timing a Program
Under UNIX, you can time program execution by placing the time command before everything else you normally type on the command line. When the program finishes, a timing summary is produced. For instance, if your program is called foo, you can time its execution by typing time foo. If you are


using the C shell or the Korn shell, time is one of the shell's built-in commands. With a Bourne shell, time is a separate command executable in /bin. In any case, the following information appears at the end of the run:

• User time
• System time
• Elapsed time

These timing figures are easier to understand with a little background. As your program runs, it switches back and forth between two fundamentally different modes: user mode and kernel mode. The normal operating state is user mode. It is in user mode that the instructions the compiler generated on your behalf get executed, in addition to any subroutine library calls linked with your program. It might be enough to run in user mode forever, except that programs generally need other services, such as I/O, and these require the intervention of the operating system, the kernel. A kernel service request made by your program, or perhaps an event from outside your program, causes a switch from user mode into kernel mode. Time spent executing in the two modes is accounted for separately. The user time figure describes the time spent in user mode. Similarly, system time is a measure of the time spent in kernel mode. As far as user time goes, each program on the machine is accounted for separately; that is, you won't be charged for activity in somebody else's application. System time accounting works the same way, for the most part; however, you can, in some instances, be charged for some system services performed on other people's behalf, in addition to your own. Incorrect charging occurs because your program may be executing at the moment some outside activity causes an interrupt. This seems unfair, but take consolation in the fact that it works both ways: other users may be charged for your system activity too, for the same reason. Taken together, user time and system time are called CPU time. Generally, the user time is far greater than the system time. You would expect this because most applications only occasionally ask for system services. In fact, a disproportionately large system time probably indicates some trouble. For instance, programs that repeatedly generate exception conditions, such as page faults, misaligned memory references, or floating-point exceptions, use an inordinate amount of system time. Time spent doing things like seeking on a disk, rewinding a tape, or waiting for characters at the terminal doesn't show up in CPU time; these activities don't require the CPU, which is free to go off and execute other programs. The third piece of information, elapsed time, is a measure of the actual (wall clock) time that has passed since the program was started. For programs that spend most of their time computing, the elapsed time should be close to the CPU time. Reasons why the elapsed time might be greater are:

• You are timesharing the machine with other active programs171.
• Your application performs a lot of I/O.
• Your application requires more memory bandwidth than is available on the machine.
• Your program was paging or swapped.

171 The uptime command gives you a rough indication of the other activity on your machine. The last three fields tell the average number of processes ready to run during the last 1, 5, and 15 minutes, respectively.

People often record the CPU time and use it as an estimate for elapsed time. Using CPU time is okay on a single-CPU machine, provided you have seen the program run when the machine was quiet and noticed that the two numbers were very close together. But for multiprocessors, the total CPU time can be far different from the elapsed time. Whenever there is a doubt, wait until you have the machine to


yourself and time your program then, using elapsed time. It is very important to produce timing results that can be verified using another run when the results are being used to make important purchasing decisions. If you are running on a Berkeley UNIX derivative, the C shell's built-in time command can report a number of other useful statistics; check the csh manual page for more possibilities. In addition to figures for CPU and elapsed time, the csh time command produces information about CPU utilization, page faults, swaps, blocked I/O operations (usually disk activity), and some measures of how much physical memory our program occupied when it ran. We describe each of them in turn.
Timing a Portion of the Program
For some benchmarking or tuning efforts, measurements taken on the outside of the program tell you everything you need to know. But if you are trying to isolate performance figures for individual loops or portions of the code, you may want to include timing routines on the inside too. The basic technique is simple enough:

1. Record the time before you start doing X.
2. Do X.
3. Record the time at completion of X.
4. Subtract the start time from the completion time.

If, for instance, X's primary job is to calculate particle positions, divide by the total time to obtain a number for particle positions per second. You have to be careful, though: with too many calls to the timing routines, the observer becomes part of the experiment. The timing routines take time too, and their very presence can increase instruction cache misses or paging. Furthermore, you want X to take a significant amount of time so that the measurements are meaningful. Paying attention to the time between timer calls is really important because the clock used by the timing functions has a limited resolution. An event that occurs within a fraction of a second is hard to measure with any accuracy.
Getting Time Information
In this section, we discuss methods for getting various timer values during the execution of your program. For FORTRAN programs, a library timing function found on many machines is called etime, which takes a two-element REAL*4 array as an argument and fills the slots with the user CPU time and system CPU time, respectively. The value returned by the function is the sum of the two. Here's how etime is often used:

      real*4 tarray(2), etime
      real*4 start, finish

      start = etime(tarray)
      finish = etime(tarray)

      write (*,*) 'CPU time: ', finish - start

Not every vendor supplies an etime function; in fact, one doesn't provide a timing routine for FORTRAN at all. Try it first. If it shows up as an undefined symbol when the program is linked, you can use the following C routine. It provides the same functionality as etime:

      #include <sys/times.h>
      #define TICKS 100.

      float etime (parts)
      struct {
            float user;


            float system;
      } *parts;
      {
         struct tms local;
         times (&local);
         parts->user   = (float) local.tms_utime / TICKS;
         parts->system = (float) local.tms_stime / TICKS;
         return (parts->user + parts->system);
      }

There are a couple of things you might have to tweak to make it work. First of all, linking C routines with FORTRAN routines on your computer may require you to add an underscore (_) after the function name. This changes the entry to float etime_ (parts). Furthermore, you might have to adjust the TICKS parameter. We assumed that the system clock had a resolution of 1/100 of a second (true for the Hewlett-Packard machines that this version of etime was written for); 1/60 is very common, and on an RS-6000 the number would be 1000. You may find the value in a file named /usr/include/sys/param.h on your machine, or you can determine it empirically. A C routine for retrieving the wall time by calling gettimeofday is shown below:

      #include <sys/time.h>

      void hpcwall(double *retval)
      {
         static long zsec = 0;
         static long zusec = 0;
         struct timeval tp;
         struct timezone tzp;

         gettimeofday(&tp, &tzp);

         if ( zsec == 0 ) zsec = tp.tv_sec;
         if ( zusec == 0 ) zusec = tp.tv_usec;

         *retval = (tp.tv_sec - zsec) + (tp.tv_usec - zusec) * 0.000001;
      }

      void hpcwall_(double *retval) { hpcwall(retval); } /* Other convention */

Given that you will often need both CPU and wall time, and you will be continually computing the difference between successive calls to these routines, you may want to write a routine that returns the elapsed wall and CPU time upon each call.
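A minimal sketch of such a combined routine, assuming the POSIX times() and gettimeofday() interfaces used above (the routine name and layout are illustrative, not a listing from the reference), might look like this:

      #include <sys/time.h>
      #include <sys/times.h>
      #include <unistd.h>

      /* Returns wall-clock and CPU seconds elapsed since the previous call. */
      void elapsed_times(double *wall, double *cpu)
      {
         static double last_wall = 0.0, last_cpu = 0.0;
         struct timeval tp;
         struct tms     t;
         double ticks = (double) sysconf(_SC_CLK_TCK);   /* clock ticks per second */
         double now_wall, now_cpu;

         gettimeofday(&tp, (struct timezone *) 0);
         times(&t);

         now_wall = tp.tv_sec + tp.tv_usec * 1.0e-6;
         now_cpu  = (t.tms_utime + t.tms_stime) / ticks;  /* user + system time */

         *wall = now_wall - last_wall;
         *cpu  = now_cpu  - last_cpu;
         last_wall = now_wall;
         last_cpu  = now_cpu;
      }

On the first call the routine returns times measured from an arbitrary origin; every subsequent call returns the wall-clock and CPU time consumed since the call before it, which is exactly the quantity you want when bracketing a section of code.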

Subroutine Profiling
Sometimes you want more detail than the overall timing of the application, but you don't have time to modify the code to insert several hundred etime calls. Profiles are also very useful when you have been handed a strange 20,000-line application program and told to figure out how it works and then improve its performance. Most compilers provide a facility to automatically insert timing calls into your code at the entry and exit of each routine at compile time. While your program runs, the entry and exit times are recorded and then dumped into a file. A separate utility summarizes the execution patterns and produces a report that shows the percentage of the time spent in each of your routines and the library routines. The profile gives you a sense of the shape of the execution


profile. That is, you can see that 10% of the time is spent in subroutine A, 5% in subroutine B, etc. Naturally, if you add all of the routines together they should account for 100% of the overall time spent. From these percentages you can construct a picture, a profile, of how execution is distributed when the program runs. Though not representative of any particular profiling tool, the histograms in Figure 8.5-left (sharp profile) and Figure 8.5-right (flat profile) depict these percentages, sorted from left to right, with each vertical column representing a different routine. They help illustrate different profile shapes. A sharp profile says that most of the time is spent in one or two procedures, and if you want to improve the program's performance you should focus your efforts on tuning those procedures. A minor optimization in a heavily executed line of code can sometimes have a great effect on the overall runtime, given the right opportunity. A flat profile, on the other hand, tells you that the runtime is spread across many routines, and effort spent optimizing any one or two will have little benefit in speeding up the program. Of course, there are also programs whose execution profile falls somewhere in the middle.

Figure 8.5 Sharp profiling (left) vs. flat profiling (right)

We cannot predict with absolute certainty what you are likely to find when you profile your programs, but there are some general trends. For instance, engineering and scientific codes built around matrix solutions often exhibit very sharp profiles: the runtime is dominated by the work performed in a handful of routines. To tune the code, you need to focus your efforts on those routines to make them more efficient. That may involve restructuring loops to expose parallelism, providing hints to the compiler, or rearranging memory references. In any case, the challenge is tangible; you can see the problems you have to fix. There are limits to how much tuning one or two routines will improve your runtime, of course. An often-quoted rule of thumb is Amdahl's Law, derived from remarks made in 1967 by one of the designers of the IBM 360 series, and founder of Amdahl Computer, Gene Amdahl. Strictly speaking, his remarks were about the performance potential of parallel computers, but people have adapted Amdahl's Law to describe other things too. For our purposes, it goes like this: say you have a program with two parts, one that can be optimized so that it goes infinitely fast and another that can't be optimized at all. Even if the optimizable portion makes up 50% of the initial runtime, at best you will be able to cut the total runtime in half. That is, your runtime will eventually be dominated by the portion that can't be optimized. This puts an upper limit on your expectations when tuning. Even given the finite return on effort suggested by Amdahl's Law, tuning a program with a sharp profile can be rewarding. Programs with flat profiles are much more difficult to tune. These are often


system codes, nonnumeric applications, and varieties of numerical codes without matrix solutions. It takes a global tuning approach to reduce, to any justifiable degree, the runtime of a program with a flat profile. For instance, you can sometimes optimize instruction cache usage, which is complicated because of the program's equal distribution of activity among a large number of routines. It can also help to reduce subroutine call overhead by folding callees into callers. Occasionally, you can find a memory reference problem that is endemic to the whole program and one that can be fixed all at once. When you look at a profile, you might find an unusually large percentage of time spent in library routines such as log, exp, or sin. Often these functions are done in software routines rather than inline. You may be able to rewrite your code to eliminate some of these operations. Another important pattern to look for is a routine that takes far longer than you expect. Unexpected execution time may indicate you are accessing memory in a pattern that is bad for performance or that some aspect of the code cannot be optimized properly. In any case, to get a profile, you need a profiler. One or two subroutine profilers come standard with the software development environments on all UNIX machines. We cover two of them: prof and gprof. In addition, we mention a few line-by-line profilers. Subroutine profilers can give you a general overall view of where time is being spent. You probably should start with prof, if you have it (most machines do); otherwise, use gprof. After that, you can move to a line-by-line profiler if you need to know which statements take the most time.

Loop Optimizations
In nearly all high performance applications, loops are where the majority of the execution time is spent. In this chapter we focus on techniques used to improve the performance of these "clutter-free" loops. Sometimes the compiler is clever enough to generate the faster versions of the loops, and other times we have to do some rewriting of the loops ourselves to help the compiler. It's important to remember that one compiler's performance-enhancing modification is another compiler's clutter. When you make modifications in the name of performance you must make sure you're helping by testing the performance with and without the modifications. Also, when you move to another architecture you need to make sure that any modifications aren't hindering performance. For this reason, you should choose your performance-related modifications wisely. You should also keep the original (simple) version of the code for testing on new architectures. And if the benefit of the modification is small, you should probably keep the code in its most simple and clear form. The different loop optimization techniques include:

• Loop unrolling (a brief sketch follows this list)
• Nested loop optimization
• Loop interchange
• Memory reference optimization
• Blocking
• Out-of-core solutions
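As a taste of the first technique, the C sketch below unrolls a simple accumulation loop by a factor of four; the array, its length, and the unrolling factor are illustrative choices, not prescriptions from the text.

      #include <stdio.h>

      #define N 1000

      int main(void)
      {
         double a[N], sum = 0.0;
         int i;

         for (i = 0; i < N; i++)            /* fill with something to add up */
            a[i] = 1.0;

         /* Unrolled by 4: fewer loop-overhead instructions per element and
            more independent operations for the compiler/CPU to overlap.    */
         for (i = 0; i + 3 < N; i += 4)
            sum += a[i] + a[i+1] + a[i+2] + a[i+3];

         for ( ; i < N; i++)                /* clean-up loop for the remainder */
            sum += a[i];

         printf("sum = %f\n", sum);
         return 0;
      }

Compilers usually perform this transformation automatically at normal optimization levels, which is exactly the point made in the next paragraph.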

Someday, it may be possible for a compiler to perform all these loop optimizations automatically. Typically, loop unrolling is performed as part of the normal compiler optimizations, while other optimizations may have to be triggered using explicit compile-time options. As you contemplate making manual changes, look carefully at which of these optimizations can be done by the compiler. Also run some tests to determine whether the compiler optimizations are as good as hand optimizations.
Operation Counting
Before you begin to rewrite a loop body or reorganize the order of the loops, you must have some idea of what the body of the loop does for each iteration. Operation counting is the process of


surveying a loop to understand the operation mix. You need to count the number of loads, stores, floating-point operations, integer operations, and library calls per iteration of the loop. From the count, you can see how well the operation mix of a given loop matches the capabilities of the processor. Of course, operation counting doesn't guarantee that the compiler will generate an efficient representation of a loop, but it generally provides enough insight into the loop to direct tuning efforts. Bear in mind that an instruction mix that is balanced for one machine may be imbalanced for another. Processors on the market today can generally issue some combination of one to four operations per clock cycle. Address arithmetic is often embedded in the instructions that reference memory. Because the compiler can replace complicated loop address calculations with simple expressions (provided the pattern of addresses is predictable), you can often ignore address arithmetic when counting operations. Let's look at a few loops and see what we can learn about the instruction mix:

      DO I = 1 , N
        A(I,J,K) = A(I,J,K) + B(J,I,K)
      ENDDO

This loop contains one floating-point addition and three memory references (two loads and a store). There are some complicated array index expressions, but these will probably be simplified by the compiler and executed in the same cycle as the memory and floating-point operations. For each iteration of the loop, we must also increment the index variable and test to determine whether the loop has completed. A 3:1 ratio of memory references to floating-point operations suggests that we can hope for no more than 1/3 of peak floating-point performance from the loop unless we have more than one path to memory. That's bad news, but good information: the ratio tells us that we ought to consider memory reference optimizations first. The loop below contains one floating-point addition and two memory operations, a load and a store. Operand B(J) is loop-invariant, so its value only needs to be loaded once, upon entry to the loop:

      DO I = 1 , N
        A(I) = A(I) + B(J)
      ENDDO

Again, our floating-point throughput is limited, though not as severely as in the previous loop; the ratio of memory references to floating-point operations is 2:1. The next example shows a loop with better prospects. It performs element-wise multiplication of two vectors of complex numbers and assigns the results back to the first. There are six memory operations (four loads and two stores) and six floating-point operations (two additions and four multiplications):
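A sketch of such a loop, consistent with the operation count just given (the arrays xr, xi, yr, yi holding real and imaginary parts, their length n, and the temporaries are illustrative assumptions, not the original listing), is:

      double tr, ti;                          /* hold the old (xr[i], xi[i]) values */
      for (i = 0; i < n; i++) {
         tr    = xr[i]*yr[i] - xi[i]*yi[i];   /* real part:  four loads in all      */
         ti    = xr[i]*yi[i] + xi[i]*yr[i];   /* imaginary part                     */
         xr[i] = tr;                          /* two stores                         */
         xi[i] = ti;
      }

With six floating-point operations balanced against six memory operations, this loop makes better use of a processor with a single path to memory than the 3:1 and 2:1 examples above.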


In some cases, a super-linear speedup (a speedup greater than the number of processors used) may be encountered. Usually this is caused either by using a suboptimal sequential algorithm or by some feature of the hardware architecture that favors the parallel computation. For example, one common reason for super-linear speedup is the extra memory available in the multiprocessor system. The speedup of any parallel computing environment obeys Amdahl's law, which states that if F is the fraction of a calculation that is sequential (i.e., cannot benefit from parallelization), and (1 − F) is the fraction that can be parallelized, then the maximum speedup that can be achieved by using N processors is

η = 1 / (F + (1 − F)/N)          (Eq. 9.2)

In the limit, as N tends to infinity, the maximum speedup (η) tends to 1/F. In practice, the gain in performance falls off rapidly as N is increased once (1 − F)/N is small compared to F. As an example, if F is only 10%, the problem can be sped up by at most a factor of 10, no matter how large a value of N is used. For this reason, parallel computing is only useful for either small numbers of processors or problems with very low values of F: so-called embarrassingly parallel problems. A great part of the craft of parallel programming consists of attempting to reduce F to the smallest possible value (Figure 9.4). As Amdahl's law indicates, the maximum speedup of any code is limited by the fraction that can be effectively parallelized; in other words, you are limited by the mandatory serial portions of your code179.
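As a quick numerical check of Eq. 9.2, the small C program below evaluates the bound for a given serial fraction; the value F = 0.10 corresponds to the 10% example above, and the processor counts are arbitrary illustrations.

#include <stdio.h>

/* Maximum speedup predicted by Amdahl's law, Eq. 9.2. */
static double amdahl_speedup(double F, double N)
{
    return 1.0 / (F + (1.0 - F) / N);
}

int main(void)
{
    const double F = 0.10;                 /* serial fraction from the example above */
    const int n_procs[] = { 4, 16, 64, 1024 };
    for (int k = 0; k < 4; k++)
        printf("N = %4d   speedup = %.3f\n",
               n_procs[k], amdahl_speedup(F, (double)n_procs[k]));
    /* As N grows, the speedup approaches but never exceeds 1/F = 10. */
    return 0;
}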

Figure 9.4   Amdahl's Law

For example, suppose 70% of a program can be sped up if parallelized and run on multiple CPUs instead of one.

179 Paul Edmon, "Introduction to Parallel Programming and MPI", FAS Research Computing, Harvard University.


If F is the fraction of the calculation that is sequential and (1 − F) is the fraction that can be parallelized, then the maximum speedup that can be achieved by using N processors is given by Amdahl's law (Eq. 9.2). Substituting the values for this example (F = 0.3), with 4 processors we get a speedup of 2.105, and doubling the processors to 8 gives 2.581. So in this case, doubling the processing power has only improved the speedup by roughly one-fifth. If the whole problem were parallelizable, we would, of course, expect the speedup to double as well. Therefore, throwing more hardware at the problem is not necessarily the optimal approach.

Weak vs. Strong Scaling
In the context of high performance computing, there are two common notions of scalability (a short sketch of the corresponding efficiency calculations follows this list):

• The first is strong scaling, which is defined as how the solution time varies with the number of processors for a fixed total problem size; ideally, execution time decreases in inverse proportion to the number of processors. Figure 9.5 displays an example of strong scaling from the NASA application FUN3D (M6 wing results shown below)180.
• The second is weak scaling, which is defined as how the solution time varies with the number of processors for a fixed problem size per processor; ideally, execution time remains constant as the problem size and the number of processors are increased in proportion.
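A small, hedged C sketch of how the two notions are commonly quantified is given below; the timings and processor count are placeholders, not measured data. Strong-scaling efficiency compares the N-processor time against the single-processor time for the same total problem, while weak-scaling efficiency compares times for a problem that grows with the processor count.

#include <stdio.h>

int main(void)
{
    const int    N         = 8;      /* number of processors (placeholder)          */
    const double t1_strong = 100.0;  /* 1-processor time, fixed total problem size  */
    const double tN_strong = 14.0;   /* N-processor time, same total problem size   */
    const double t1_weak   = 100.0;  /* 1-processor time, fixed size per processor  */
    const double tN_weak   = 105.0;  /* N-processor time, N-times-larger problem    */

    printf("strong-scaling efficiency = %.2f\n", t1_strong / (N * tN_strong));
    printf("weak-scaling efficiency   = %.2f\n", t1_weak / tN_weak);
    return 0;
}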

Figure 9.5   Example of Strong Scalability

Scalability vs. Performance
First, it is critical for readers to understand the fundamental difference between scalability and performance. While the two are frequently conflated, they are quite different. Performance is the capability of a particular component to provide a certain amount of capacity. Scalability, in contrast, is about the ability of a system to expand to meet demand. This is quite frequently measured by looking at the aggregate performance of the individual components of a particular system and how they function over time. Put more simply, performance measures the capability of a single part of a large system, while scalability measures the ability of a large system to grow to meet growing demand. Scalable systems may have individual parts that are relatively low performing.

180 David E. Keyes, "Domain Decomposition Methods for Partial Differential Equations", Columbia University.


I have heard that the Amazon.com retail website's web servers went from 300 transactions per second (TPS) to a mere 3 TPS each after moving to a more scalable architecture. The upside is that while every web server might have lower individual performance, the overall system became significantly more scalable, and new web servers could be added ad infinitum. Most x86 clusters today are built out for very high performance and scalability, but with a particular focus on the performance of individual components (servers) and the interconnect network181.

Load Balancing
A load balancer is a device that acts as a reverse proxy and distributes network or application traffic across a number of servers. Load balancers are used to increase the capacity (concurrent users) and reliability of applications. They improve the overall performance of applications by decreasing the burden on servers associated with managing and maintaining application and network sessions, as well as by performing application-specific tasks. Load balancers are generally grouped into two categories: Layer 4 and Layer 7. Layer 4 load balancers act upon data found in network and transport layer protocols (IP, TCP, FTP, UDP), while Layer 7 load balancers distribute requests based upon data found in application layer protocols such as HTTP. Requests received by either type of load balancer are distributed to a particular server based on a configured algorithm. Some industry standard algorithms are:

• Round robin
• Weighted round robin
• Least connections
• Least response time

Load balancers ensure reliability and availability by monitoring the "health" of applications and only sending requests to servers and applications that can respond in a timely manner.
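As a simple illustration of the first algorithm in the list above, the following hypothetical C sketch cycles requests through a fixed pool of servers in strict rotation; the server names and pool size are illustrative.

#include <stdio.h>

static const char *servers[]   = { "app-server-1", "app-server-2", "app-server-3" };
static const int   num_servers = 3;

/* Round robin: each new request goes to the next server in the rotation. */
static const char *next_server(void)
{
    static int next = 0;
    const char *chosen = servers[next];
    next = (next + 1) % num_servers;
    return chosen;
}

int main(void)
{
    for (int request = 0; request < 7; request++)
        printf("request %d -> %s\n", request, next_server());
    return 0;
}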

Performance of CFD Codes
The method used to assess the performance of a parallel CFD solver is becoming a topic of debate. While some implementations use a fixed number of outer iterations to assess the performance of the parallel solver, regardless of whether a solution has been obtained or not, other implementers use a fixed value of the residual as the basis for evaluation. Ironically, a large number of implementers do not mention the method used in their assessment! The reason for this discrepancy is that the first group (who use a fixed number of outer iterations) believes that the evaluation of parallel performance should be done using exactly the same algorithm, which justifies the use of a fixed number of outer iterations. This is acceptable from an algorithmic point of view. The other group (who use a fixed value of the maximum residual) believes that the evaluation of parallel performance should be done using the converged solution of the problem, which justifies the use of the maximum residual as the criterion for performance measurement. This is acceptable from an engineering point of view and from the user's point of view; in all cases, the parallel code will be used to seek a valid solution! If the number of outer iterations happens to be the same as that of the sequential version, so much the better. The problem becomes more complicated when an algebraic multigrid (AMG) solver is used. Depending on the method used in implementing the AMG solver, the maximum number of AMG levels in the parallel version will usually be less than that of the sequential version, which raises the issue that one is not comparing the same algorithm. From an engineering point of view, the main concern is to obtain a valid solution for a given problem in a reasonable amount of time, and thus a

181 Randy Bias, "Grid, Cloud, HPC ... What's the Diff?", posted on the Cloudscaling Blog, 2010.


user will not actually perform a sequential run and then a parallel run; rather, she will require the code to use as many AMG levels as possible.

CFD for Next Generation High Performance Computing
High Performance Computing (HPC) is moving towards large-scale parallelism. The Jaguar supercomputer, at the time of that work the fastest computer in the world, has over 200,000 processing cores. On-chip parallelism has been increasing in regular processors (dual core, quad core, etc.) since 2001, but now larger scales of parallelism are being seen on a single chip. The introduction of Graphics Processing Units (GPUs), which have hundreds of cores on a single chip, into HPC represents a large change in the architectures being used for scientific computing. This scale of parallelism and these new architectures require novel numerical solvers to be written and optimized for solving CFD problems.

Hardware Consideration and CPU vs. GPU Technology
GPUs have traditionally been used for rendering graphics, in which several relatively simple operations are performed identically on all parts of the input to produce an output image182. The nature of this work makes graphics rendering a good candidate for parallelization, and GPUs reflect this in their architecture by having many cores. The differences between CPU and GPU architectures can be summarized in four points, visualized in Figure 9.6:

Figure 9.6   Architecture Differences Between CPU and GPU

• CPUs contain few cores but have a relatively large cache (several MB).
• CPUs have many advanced features such as branch prediction, out-of-order execution, and deep pipelines to improve throughput.
• GPUs have hundreds of cores split into groups which share control hardware and high-speed memory (equivalent to a cache); this high-speed memory is very small (a few KB).
• GPU cores are "lightweight", i.e., they lack the advanced features of CPU cores.

182 Mark Mawson, Alistair Revell & Robert Prosser, "Computational Fluid Dynamics Codes for Next Generation High Performance Computing", Computational Fluid Dynamics Group, University of Manchester.


9.5.2.1 Case Study 1 – 2D Laplace Equation
A V-cycle multigrid method was written for GPUs to solve a 2D Laplace problem (Δu = 0). The multigrid method solves a system of linear equations by restricting the error of the initial fine-grid solution to increasingly coarse grids and performing smoothing operations on them. It has been shown that higher-frequency errors are more susceptible to smoothing operations than low-frequency errors. By restricting the solution to coarser grids, the relative frequency of the errors increases, making them susceptible to smoothing. Each level of coarseness allows a low-frequency error component to be smoothed; the solution at each grid level is then summed with the level above, and further smoothing is carried out to remove any errors introduced by the summation. The result is a solution that is smoothed across all frequencies of error.

Figure 9.7   Results for V-Cycle Multigrid

9.5.2.2 Results
GPU and CPU implementations of the V-cycle multigrid method were tested on grids of size up to 4097 × 4097 elements. The maximum grid size was limited by the size of the RAM on the GPU (4 GB); future work will include allowing partitions of larger grids to be moved to and from the GPU. The GPU implementation performed up to 12× faster than the CPU version.
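The smoothing operation described above is typically a simple point-iterative scheme; a serial C sketch of one weighted-Jacobi sweep over the interior points of a 2D Laplace grid is shown below. The grid dimensions, storage layout, and relaxation factor are illustrative assumptions, and the cited GPU implementation itself is not reproduced here.

/* One weighted-Jacobi smoothing sweep for the 2D Laplace equation on an
   nx-by-ny grid stored in row-major order (serial, illustrative only). */
void jacobi_sweep(double *u_new, const double *u_old,
                  int nx, int ny, double omega)
{
    for (int j = 1; j < ny - 1; j++) {
        for (int i = 1; i < nx - 1; i++) {
            int id = j * nx + i;
            /* Average of the four neighbors from the discrete Laplacian. */
            double avg = 0.25 * (u_old[id - 1] + u_old[id + 1] +
                                 u_old[id - nx] + u_old[id + nx]);
            /* Weighted update damps the high-frequency error components. */
            u_new[id] = (1.0 - omega) * u_old[id] + omega * avg;
        }
    }
}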

9.5.2.3 Future Work – Heterogeneous Computing
In the V-cycle method shown, the CPU is idle while functions run on the GPU. The principle of heterogeneous computing is that the CPU performs other tasks while the GPU is being used. For CFD applications this could include using the CPU to assist with the solver or, for time-dependent problems, post-processing the previous time step while the GPU calculates the latest time step (see Figure 9.8).

Figure 9.8   Heterogeneous Computing using CPUs and GPUs


Case Study 2 - Unstructured Grid Based CFD Solvers on Modern Graphics Hardware
The 3D Euler equations for inviscid, compressible flow are considered by [Corrigan et al.]183. Effective memory bandwidth is improved by reducing total global memory access and overlapping redundant computation, as well as by using an appropriate numbering scheme and data layout. The applicability of per-block shared memory is also considered. The performance of the solver is demonstrated on two benchmark cases: a missile and the NACA 0012 wing. For a variety of mesh sizes, an average speed-up factor of roughly 9.5× is observed over the equivalent parallelized OpenMP code running on a quad-core CPU, and roughly 33× over the equivalent code running in serial.

9.5.3.1 Background and Literature Survey
Recently, GPUs (Graphics Processing Units) have seen a tremendous increase in performance. In addition to this high computational performance, the latest modern graphics hardware offers increasing memory capacity, as well as support for 64-bit floating point arithmetic. Like parallel, multicore processors, GPUs offer tremendous potential for applications in computational fluid dynamics. In order to fully exploit the computational power of such hardware, considerable care is required in the coding and implementation, particularly in the memory access pattern. GPUs have general-purpose global memory, which is not automatically cached and exhibits high latency in comparison with the instruction throughput of GPUs. Furthermore, with earlier CUDA-enabled GPUs, there were stringent requirements for achieving optimal effective memory bandwidth, with a large loss of performance when these requirements went unmet. With the data-dependent memory access of unstructured grid based solvers, this loss of performance is almost assured. However, with due care, structured grid based solvers can meet these requirements due to the regular memory access patterns of such solvers, as described in the work of (Brandvik & Pullan) and (Tolke). Further work on regular grid solvers includes that of (Phillips et al.), who have developed a 2D compressible Euler solver on a cluster of GPUs, and [Thibault et al.]184, who have implemented a 3D incompressible Navier-Stokes solver for multi-GPU systems. So far, the implementation of optimized unstructured grid based solvers for modern graphics hardware has been relatively rare, perhaps due to these stringent requirements. In fact, just prior to the first release of CUDA, [Owens et al.]185 comprehensively surveyed the field of general-purpose computation on graphics hardware (GPGPU), which included a number of primarily structured grid based solvers, such as those of [Harris]186, [Scheidegger et al.]187, and [Hagen et al.]188. However, the architecture has changed substantially and many of the limitations of GPGPU via traditional graphics APIs such as OpenGL are no longer an issue. The most recent CUDA-enabled GPUs have looser requirements for achieving high effective memory bandwidth. Roughly speaking, memory no longer needs to be accessed in a specific order by consecutive threads. Rather, high effective memory bandwidth can be achieved as long as consecutive threads access nearby locations in memory, which is called coalescing.

183 Andrew Corrigan, Fernando Camelli, Rainald Lohner, and John Wallin, "Running Unstructured Grid Based CFD Solvers on Modern Graphics Hardware", 19th AIAA Computational Fluid Dynamics, 2009.
184 Thibault, J. and Senocak, I., "CUDA Implementation of a Navier-Stokes Solver on Multi-GPU Desktop Platforms for Incompressible Flows," 47th AIAA Aerospace Sciences Meeting Including The New Horizons Forum and Aerospace Exposition, No. AIAA 2009-758, January 2009.
185 Owens, J. D., Luebke, D., Govindaraju, N., Harris, M., Krüger, J., Lefohn, A. E., and Purcell, T. J., "A Survey of General-Purpose Computation on Graphics Hardware," Computer Graphics Forum, Vol. 26, No. 1, 2007.
186 Harris, M., "Fast Fluid Dynamics Simulation on the GPU," GPU Gems, chap. 38, Addison-Wesley, 2004.
187 C. Scheidegger, J. Comba, R. C., "Practical CFD simulations on the GPU using SMAC," Computer Graphics Forum, Vol. 24, 2005.
188 Hagen, T., Lie, K.-A., and Natvig, J., "Solving the Euler Equations on Graphics Processing Units," Proceedings of the 6th International Conference on Computational Science, Vol. 3994 of Lecture Notes in Computer Science, Springer, May 2006.


Thus, if an appropriate memory access pattern is obtained, one can expect that modern GPUs will be capable of achieving high effective memory bandwidth and, in general, high performance for unstructured grid based CFD solvers. The purpose of this work is to study techniques which achieve this.

9.5.3.2 Implementation on Graphics Hardware
The performance-critical portion of the solver consists of a loop which repeatedly computes the time derivatives of the conserved variables [see Corrigan et al.]189. The conserved variables are then updated using an explicit Runge-Kutta time-stepping scheme. The most expensive computation consists of accumulating flux contributions and artificial viscosity across each face when computing the time derivatives. Therefore, the performance of the CUDA kernel which implements this computation is crucial in determining whether or not high performance is achieved, and it is the focus of this section.

9.5.3.3 Test Cases
The performance of the GPU code was measured on a prototype NVIDIA Tesla GPU, supporting compute capability 1.3, with 24 multiprocessors. A NACA 0012 wing in supersonic (M = 1.2, α = 0) flow was used as a test case; the pressure contours are plotted in Figure 9.9 (left). Timing measurements when running in single precision were taken for a variety of meshes, showing an average performance scaling factor of 9.4× in comparison to the OpenMP code running on four cores and 32.6× in comparison to the OpenMP code on one core. Furthermore, the code running on graphics hardware is faster by a factor of 3.9× when using redundant computation in comparison to pre-computed flux contributions. Timing measurements when running in double precision are given in Figure 9.10 (top) for a variety of meshes, showing an average performance scaling factor of 1.56× in comparison to the OpenMP code running on four cores and 4.7× in comparison to the OpenMP code on one core. Furthermore, the code running on graphics hardware is faster by a factor of 1.1× when using redundant computation in comparison to pre-computed flux contributions.

Figure 9.9   Pressures at the Surface and Plane for the NACA 0012 (Left) and at the Surface for the Missile (Right)

189 Andrew Corrigan, Fernando Camelli, Rainald Lohner, and John Wallin, "Running Unstructured Grid Based CFD Solvers on Modern Graphics Hardware", 19th AIAA Computational Fluid Dynamics, 2009.


Figure 9.10   Running Times in Double Precision per Element per Iteration for the NACA 0012 (Top) and Missile (Bottom)


A missile in supersonic (M = 1.2, α = 8 degrees) flow was used as an additional test case; the pressure contours are plotted in Figure 9.9 (right). Timing measurements when running in double precision are given in Figure 9.10 (bottom) for a variety of meshes, showing an average performance scaling factor of 2.5× in comparison to the OpenMP code running on four cores and 7.4× in comparison to the OpenMP code on one core. Furthermore, the code running on graphics hardware is faster by a factor of 1.63× when using redundant computation in comparison to pre-computed flux contributions. For additional details, consult [Corrigan et al.]190.

Software Consideration and Message Passing Interface (MPI)
MPI (Message Passing Interface) is a message passing standard for homogeneous and heterogeneous parallel and distributed computing systems. The development of the MPI standard is a multinational effort which was initiated in 1992 and is supported by ARPA, NSF, and the Commission of the European Community. A good introduction to MPI is provided by Foster191, and a brief description is presented in192. MPI is a library, not a language: it consists of subroutines that are called from FORTRAN, C, or C++ programs to facilitate parallelization. An MPI program includes one or more processes which communicate with each other through calls to MPI library routines. There are two types of communication, namely point-to-point communication between pairs of processes, and collective communication between groups of processes. Several variants of the "send" and "receive" functions are provided to enable users to achieve peak performance. The basic syntax of a "hello world" program, in both C and FORTRAN, is provided below193.

#include <mpi.h>     /* Need to include this to be able to hook into the MPI API */
#include <stdio.h>

int main(int argc, char *argv[])
{
    int numprocs, rank;
    /* Initializes MPI */
    MPI_Init(&argc, &argv);
    /* Figures out the number of processors I am asking for */
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    /* Figures out which rank we are */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    printf("Process %d out of %d\n", rank, numprocs);
    /* Need this to shutdown MPI */
    MPI_Finalize();
}

PROGRAM hello
!### Need to include this to be able to hook into the MPI API ###
INCLUDE 'mpif.h'
INTEGER*4 :: numprocs, rank, ierr
!### Initializes MPI ###
CALL MPI_INIT(ierr)
!### Figures out the number of processors I am asking for ###
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, numprocs, ierr)
!### Figures out which rank we are ###
CALL MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr)
write(*,*) 'Process', rank, 'out of', numprocs
!### Need this to shutdown MPI ###
CALL MPI_FINALIZE(ierr)
END PROGRAM hello
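To illustrate the point-to-point communication mentioned above, the following minimal C sketch has rank 0 send a single integer to rank 1; the tag and payload are illustrative, and it must be run with at least two processes.

#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int rank, value = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        value = 42;                                   /* illustrative payload */
        MPI_Send(&value, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
    } else if (rank == 1) {
        MPI_Recv(&value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        printf("Rank 1 received %d from rank 0\n", value);
    }

    MPI_Finalize();
    return 0;
}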

190 Andrew Corrigan, Fernando Camelli, Rainald Lohner, and John Wallin, "Running Unstructured Grid Based CFD Solvers on Modern Graphics Hardware", 19th AIAA Computational Fluid Dynamics, 2009.
191 I. Foster, "Designing and Building Parallel Programs," http://www.mcs.anl.gov/dbpp/.
192 L. Clarke, I. Glendinning, and R. Hempel, "The MPI Message Passing Interface Standard," March 1994.
193 Paul Edmon, "Introduction to Parallel Programming and MPI", FAS Research Computing, Harvard University.


Cloud Computing: Definition and Features194
From the technical definition (Mell et al. 2011), "cloud computing is a model for enabling ubiquitous, convenient, on-demand network access to a shared pool of resources (e.g., network servers, storage, applications and services) that can be rapidly provisioned and released with minimal management effort or service provider interaction." Clouds have three service models, as described below:

• First, "Infrastructure as a Service" (IaaS) refers to on-demand provisioning of physical resources, usually in terms of Virtual Machines (VMs). The consumers can deploy and run arbitrary software, and they do not manage or control the underlying cloud infrastructure. Examples of IaaS providers include Amazon EC2, Windows Azure Virtual Machines, Google Compute Engine, GoGrid, and Flexiscale.
• Second, "Platform as a Service" (PaaS) refers to platform-layer resources, including operating system support, databases, web servers, and software development frameworks. This means users host an environment for their applications and can control them, but they cannot control the operating system, hardware, or network that they are using. PaaS provider examples include Google App Engine and Microsoft Windows Azure Compute.
• The last model is "Software as a Service" (SaaS), which refers to providing on-demand applications over the Internet: providers install and operate application software on a cloud infrastructure, and users access it through an interface via the Internet. SaaS pricing is typically yearly or monthly. Related examples include Google Apps, Microsoft Office 365, and Autodesk 360.

In addition, clouds have four deployment models. In a Public Cloud, the infrastructure is provisioned for open use by the general public. In a Private Cloud, the infrastructure is provisioned for exclusive use by a single organization; it can be managed by the organization, by third parties, or by some combination of the two. A private cloud offers the highest degree of control over performance, reliability, and security, but it does not provide benefits such as zero up-front cost. In a Community Cloud, the infrastructure is shared by several organizations with the same concerns. A Hybrid Cloud is a combination of two or more clouds that remain unique entities but are bound together by standardized or proprietary technologies that enable data and application portability. Cloud computing provisioning relies heavily on virtualization, which enables offering a homogeneous service simultaneously to all users. The main reasons for virtualization are abstraction and encapsulation. One important advantage of virtualization is that, on the same physical infrastructure, different runtime environments can exist without re-initialization of hardware. Runtime environments can be managed easily and started or stopped quickly.

High Performance Computing (HPC)
Scientists, engineers, and analysts in virtually every field are turning to High Performance Computing (HPC) to solve today's vital and complex problems195. Simulations are increasingly replacing expensive physical testing, as more complex environments can be modeled and, in some cases, fully simulated. High-performance computing encompasses advanced computation over parallel processing, enabling faster execution of highly compute-intensive tasks such as climate research, molecular modeling, physical simulations, cryptanalysis, geophysical modeling, automotive and aerospace design, financial modeling, data mining, and more. Figure 9.11 shows the Maui High Performance Computing Center, with 1,280 servers and a Mellanox InfiniBand interconnect delivering 42.3 TFLOPS (courtesy of Mellanox Technologies). HPC clusters have become the most common building blocks

194 2014 ASHRAE/IBPSA-USA Building Simulation Conference, Atlanta, GA, September 10-12, 2014.
195 Mellanox Technologies Inc., 2006.


for high-performance computing, not only because they are affordable, but because they provide the needed flexibility and deliver superior price/performance compared to proprietary Symmetric Multi-Processing (SMP) systems, with the simplicity and value of industry-standard computing. Real-world application performance depends on the performance of the cluster's key elements: the processor, the memory, and the interconnect. The interconnect controls the data transfer between servers and has a strong influence on CPU efficiency and memory utilization. Transport off-load interconnect architectures, unlike "on-loading" ones, eliminate the need to deal with protocol processing within the CPU and therefore increase the number of cycles available for computational tasks. If the CPU is busy moving data and handling network protocol processing, it is unable to perform computational work, and the overall productivity of the system is severely degraded. The memory copy overhead includes the resources required to copy data buffers from the network device to the kernel memory and then from the kernel memory to the application memory. This approach requires multiple memory accesses before the data is placed in its final destination. While this is not a major problem for small data transfers, it is a big problem for larger data transfers. This is where the interconnect's zero-copy capabilities eliminate the memory bandwidth bottleneck without involving the CPU in the network data transfer.

Figure 9.11   Maui High Performance Computing Center with 1,280 Servers

The interconnect bandwidth and latency have traditionally been used as the two metrics for assessing the performance of the system's interconnect fabric. However, these two metrics are typically not sufficient to determine the performance of real-world applications. Typical real-world applications send messages ranging from 64 bytes to 4 megabytes, using not only point-to-point communication but a diverse mixture of communication patterns, including collective and reduction patterns in the case of MPI. In some cases, interconnect vendors create artificial benchmarks, such as message rate, and apply bombastic marketing slogans to these benchmarks, such as "Hyper messaging". Message rate is yet another single point on the point-to-point bandwidth graph: if the traditional interconnect bandwidth indicates the maximum available bandwidth (a single point), message rate indicates the bandwidth for a message size of zero or 2 bytes. These single points of data give some indication of interconnect performance, but are far from describing real-world application performance. The interactive combination of those points, together with others (CPU overhead, zero copy, etc.), will determine the overall ability of the connectivity solution. The difference between theoretical power and what is actually delivered is measured as processor efficiency. The more CPU cycles used to get the data out the door by "filling the wire" due to protocol and data transfer inefficiencies, the fewer cycles are available for the application. When comparing latencies of different interconnects, one needs to pay attention to the interconnect architecture.


A 1 μs latency "on-loading" interconnect versus a 2 μs latency "off-load" solution is similar to having to decide between two cars that show the same horsepower (i.e., CPU). Both engines are capable of 200 miles per hour, but the first car, due to "on-loading", limits the actual engine power to 75 miles per hour (the engine power must be used for other tasks). The second car has no limitations on the engine, but its wheels can tolerate only 150 miles per hour. The knowledge of the wheels' tolerance (i.e., latency), as a single point of data, is definitely misleading.

Real Application Performance
InfiniBand is a proven interconnect for clustered server solutions and one of the leading connectivity solutions for high-performance computing. InfiniBand was designed as a general I/O interconnect, and in practice it provides low latency and the highest link speed. Computational Fluid Dynamics (CFD) is one of the branches of fluid mechanics that uses numerical methods and algorithms to solve and analyze problems that involve fluid flows. FLUENT provides a set of benchmark problems which represent typical current usage and cover a wide range of mesh sizes and physical models. The problems selected represent a range of simulations typical of those which might be found in industry. The principal objective of this benchmark suite is to provide comprehensive and fair comparative information on the performance of the code on available hardware platforms. The benchmark represents the computation of the exterior flow field around a simplified model of a passenger sedan (see Figure 9.12). The simulation geometry was used for the Japan External Aerodynamics competition. A viscous hybrid grid with prismatic cells is used to adequately model the boundary layer regions (number of cells 3,618,080; cell type hybrid; model k-ε turbulence; solver segregated implicit).

Figure 9.12   Performance Rate of Two HPC Interconnects for Benchmark CFD Analysis

Choosing the Right Interconnect
In both FLUENT benchmark cases, Mellanox InfiniBand shows higher performance and better super-linear scaling compared to QLogic InfiniPath. FLUENT's CFD application is latency-sensitive, and the results shown here are good examples of how pure latency benchmarks can be misleading when choosing the right interconnect. In order to determine the system's performance, one should take into consideration the entire interconnect architecture (such as off-loading versus on-loading) and the ability to scale, rather than just single points of data.

Grid Computing vs. HPC
The origins of HPC/grid computing lie within the academic community, where the need to crunch large data sets arose very early on; think satellite data, genomics, nuclear physics, etc. Grid, effectively, has been around since the beginning of the enterprise computing era, when it became easier for academic research institutions to move away from large mainframe-style supercomputers (e.g., Cray, Sequent)


towards a more scale-out model using lots of relatively inexpensive x86 hardware in large clusters (the emphasis here is on relatively). Most x86 clusters today are built out for very high performance and scalability, but with a particular focus on the performance of individual components (servers) and the interconnect network, for reasons explained below. The price/performance of the overall system is not as important as the aggregate throughput of the entire system. Most academic institutions build out a grid to their full budget, attempting to eke out every ounce of performance in each component196.

HPC vs. HSC
The reality is that High Scalability Computing (HSC) is ideal for the majority of EPP grid workloads. HPC is a different beast altogether, as many of the MPI workloads require very low latency and servers with individually high performance. It turns out, however, that not all MPI workloads are the same. The lower portion of the top part of that pyramid is filled with MPI workloads that require a great network, but not an InfiniBand network (see Figure 9.13).

Figure 9.13   Scope of HPC and HSC

The Moral of the Story
What we have learned is that scalable computing is different from computing optimized for performance, and that the cloud can accommodate grid and HPC workloads but is not itself necessarily a grid in the traditional sense. More importantly, an extremely overlooked segment of grid (EPP) has pressing needs that can be accommodated by run-of-the-mill clouds such as EC2. In addition to supporting EPP workloads that run on the 'regular' cloud, some clouds may also build out an area designed specifically for 'HPC' workloads. In other words, grid is not cloud, but there are some relationships, and there is obviously a huge opportunity for cloud providers to accommodate this market segment.

HPC vs. Parallel Computing
The terms "high performance computing" and "parallel computing" are ambiguous197. Parallel computing is one mechanism for performance (resources are added linearly, O(n), or in special SIMD cases O(n²)). In parallel computing, you are expected to maintain some level of consistency, and perhaps determinism. When you are doing HPC, you are putting performance at a premium. High performance computing can indeed cover parallel computing, but it can also include the efficient use of caches, the TLB, SIMD instructions, and other things that are high performance but not parallel.
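As a small illustration of the non-parallel side of this distinction, the hedged C sketch below contrasts two traversal orders of a row-major array; the array size and function names are illustrative. The row-order version touches memory contiguously and therefore uses the cache far more efficiently, a gain that involves no parallelism at all.

#define N 1024

/* Column-order traversal: large strides through memory, poor cache reuse. */
double sum_column_order(double a[N][N])
{
    double sum = 0.0;
    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            sum += a[i][j];
    return sum;
}

/* Row-order traversal: contiguous accesses, far friendlier to the cache. */
double sum_row_order(double a[N][N])
{
    double sum = 0.0;
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            sum += a[i][j];
    return sum;
}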

196 Randy Bias, "Grid, Cloud, HPC ... What's the Diff?", posted on the Cloudscaling Blog, 2010.
197 Victor Eijkhout, postdoctoral and industrial experience in HPC.


HPC vs. HTC
There are many differences between high-throughput computing (HTC) and high-performance computing (HPC). HPC tasks are characterized as needing large amounts of computing power for short periods of time, whereas HTC tasks also require large amounts of computing, but for much longer times (months and years, rather than hours and days). HPC environments are often measured in terms of FLOPS. The HTC community, however, is not concerned with operations per second, but rather operations per month or per year. Therefore, the HTC field is more interested in how many jobs can be completed over a long period of time than in how fast individual jobs run.


10 CFD and HPC Trends Forecasted for 2030
CFD codes utilize High Performance Computing (HPC) systems, so understanding where HPC technology might be in the 2030 timeframe is an important component of creating a vision for CFD codes in 2030198. Of course, forecasting where HPC technologies will be in the future requires a significant amount of extrapolation, which is especially hard in such a fast-changing area as HPC. The fastest current systems can perform tens of petaFLOPS199 (1 petaFLOPS is 10^15 floating point operations per second), and the HPC community is working toward systems capable of 10^18 FLOPS (exaFLOPS), which are expected sometime between 2018 and 2023. Some work is even looking at 10^21 FLOPS (zettaFLOPS); however, reaching that level of performance is unlikely without radically new technologies. A common, though controversial, measure of HPC systems is the total number of floating point operations a given system can perform per second while solving a large linear system of equations using Gaussian elimination; this is the High Performance LINPACK (HPL) benchmark. Twice a year, a list of the top 500 systems in the world measured against this benchmark is published by the Top500 organization. The current list (June 2013) is topped by the Tianhe-2 system, developed by China's National University of Defense Technology, which achieved 33.86 petaFLOPS on the LINPACK benchmark. Here, we will estimate only the peak floating-point performance in terms of the maximum number of operations that can be performed per second. We note that the performance of many applications, including CFD applications, may be more accurately estimated by using sustained memory bandwidth; for the present purposes, however, peak floating-point performance is an adequate measure, provided that other aspects of system performance remain in step with it.

Comparison of Semiconductor Fabrication Sizes in HPC
A significant measure of a processor is the feature size of its components. The smaller the features, the more elements can be placed in the same area, and hence the more powerful a processor becomes. Feature size also has a direct impact on power consumption and heat generation, with smaller sizes being better. Thus, forecasting the feature sizes of future processors is very important. Unfortunately, the industry has not always been good at such forecasting, which is one reason why predicting where HPC technology will be in 2030 is particularly hard. For example, in 2005 the International Technology Roadmap for Semiconductors (ITRS) forecasted a 22-nm (1 nm = 10^-9 m) gate length by 2008; that is, the structures in a modern processor were forecast to have features with sizes around 22 nm.

Figure 10.1   Changing Predictions About Semiconductor Sizes

198 J. Slotnick, A. Khodadoust, J. Alonso, D. Darmofal, W. Gropp, E. Lurie, and D. Mavriplis, "CFD Vision 2030 Study: A Path to Revolutionary Computational Aerosciences", NASA/CR-2014-218178.
199 Kraft, E. M., "Integrating Computational Science and Engineering with Testing to Re-engineer the Aeronautical Development Process", AIAA Paper 2010-0139, 48th AIAA Aerospace Sciences Meeting, January 2010, 10.2514/6.2010-139.


However, in 2008 the forecast date moved to 2011, and in 2011 it moved again to 2012. A similar slip occurred for other (smaller) gate lengths (see Figure 10.1). Note that the forecasts of the ITRS combine inputs from all major chip manufacturers, equipment suppliers, and research communities and consortia, so they represent the combined wisdom of the industry. Nevertheless, as Figure 10.1 shows, forecasting a key feature of even this basic component of processors is hard. Another critical component of HPC capability in 2030 is the advances in software infrastructure and programming methodologies that will be necessary to take advantage of these future HPC systems. The ultimate purpose of these systems is to solve the most pressing problems in academia and industry. In particular, industrial users pursue this technology because of its large impact on future product designs, and because of the ability to avoid or minimize the use of other, more costly methods such as wind tunnels or other types of physical tests.

Current Status of CFD
At present, CFD is used extensively in the aerospace industry for the design and analysis of air and space vehicles and components. However, the penetration of CFD into aerospace design processes is not uniform across vehicle types, flight conditions, or components. CFD often plays a complementary role to wind tunnel and rig tests, engine certification tests, and flight tests by reducing the number of test entries and/or reducing testing hours200-201. But in many circumstances, CFD provides the only affordable or available source of engineering data to use in product design, due to limitations either with model complexity and/or wind tunnel capability, or due to design requirements that cannot be addressed with ground-based testing of any kind. As a result, CFD technology development has been critical not only in minimizing product design costs, but also in enabling the design of truly novel platforms and systems. Generally, the design process is composed of three key phases: conceptual design, preliminary and detailed design, and product validation. The current usage of CFD tools and processes in all three design phases is summarized below.

Conceptual Design
CFD is often used in the early, conceptual design of products where it has been previously calibrated for similar applications using data-morphing techniques, as well as for new configurations where little or no engineering data is available to guide design decisions. Simplified models are typically used during the conceptual optimization phase to allow reasonably accurate trades to be made between drag, fuel consumption, weight, payload/range, thrust, or other performance measures. Use of simplified models is necessary to allow often time-consuming optimization processes to be used in the overall design effort, but it inherently places conservatism into the final design. This conservatism derives from the use of models that are too similar within the existing product design space, from other geometric simplifications, or from the use of low-fidelity CFD tools that trade off flow physics modeling accuracy for execution speed.

Preliminary/Detailed Design
Once a product development program is launched, CFD is a necessary and uniformly present tool in the detailed configuration design process. For example, CFD is indispensable in the design of cruise wings in the presence of nacelles for commercial airplanes, and for inlet and nozzle designs; wind tunnels are used to confirm the final designs202-203.

200 Jameson, A., "Re-engineering the Design Process Through Computation", AIAA Journal of Aircraft, Vol. 36, 1999, pp. 36-50.
201 Goldhammer, M. I., "Boeing 787 – Design for Optimal Airplane Performance", CEAS/KATnet Conference on Key Aerodynamic Technologies, Bremen, Germany, June 2005.
202 Malik, M. R. and Bushnell, D. M. (eds.), "Role of Computational Fluid Dynamics and Wind Tunnels in Aeronautics R&D", NASA TP-2012-217602, September 2012.
203 Goldhammer, M. I., "Boeing 787 – Design for Optimal Airplane Performance", CEAS/KATnet Conference on Key Aerodynamic Technologies, Bremen, Germany, June 2005.


In both military and commercial aircraft design processes, CFD is the primary source of data for aircraft load distributions and ground effect estimations. Similarly, gas turbine engine manufacturers rely on CFD to predict component design performance, having substantially reduced the number of single-component rigs as CFD capability has become more mature. Increasingly, multicomponent and multiphysics simulations are performed during the design cycle, but the long clock times often associated with these processes restrict their widespread adoption. For space exploration, CFD is often used to gain important insight into the flow physics needed to properly locate external components on the surface of launch vehicles or spacecraft. CFD is also increasingly providing substantial portions of the aero and propulsion performance databases. In many cases, wind tunnel data is used only to anchor the CFD data at a few test points to provide confidence in the CFD database. CFD is the primary source of data for the hypersonic flight regime, where ground testing is limited or does not exist.

Product Validation and Certification
As the product development process moves into the validation phase and certification testing, CFD is often used to confirm performance test results, to assess the redesign of components that show potential for improved performance, and to answer any other questions that arise during product testing. Typically, product configurations evolve over the testing period based on a combination of measured results and engineering judgment bolstered by the best simulation capability available. In general, CFD modeling capability grows to capture the required scope and physics to answer the questions raised during testing. The expense of responding to often unplanned technical surprises, which result in more time on the test stand or in flight test, or in changes to hardware, drives conservatism into aerospace designs and is a significant motivation for improving the accuracy and speed of CFD. If CFD is sufficiently accurate and fast, engineers can move away from their traditional design space with greater confidence and less potential risk during testing. For each of these design phases, the performance of CFD is of critical importance.

CFD Usage of High Performance Computing (HPC)
The effectiveness and impact of CFD on the design and analysis of aerospace products and systems is largely driven by the power and availability of modern HPC systems. During the last decades, CFD codes were formulated using message passing (e.g., MPI) and threading (e.g., OpenMP) software models for expressing parallelism, to run as efficiently as possible on current-generation systems. However, with the emergence of truly hierarchical memory architectures having numerous graphics processing units (GPUs), new CFD algorithms may need to be developed to realize the potential performance offered by such systems. Government labs, such as Oak Ridge National Lab (ORNL), Argonne National Lab (ANL), and the NASA Advanced Supercomputing (NAS) facility at NASA Ames Research Center, have often led the way with the acquisition and testing of new hardware. Much research on testing and tailoring of CFD algorithms takes place on these platforms, with heavy participation from academia, national labs, and to some extent industry as well. Government computing resources are also used to tackle large-scale calculations of challenge problems, such as the detailed direct numerical simulation (DNS) of spray injector atomization or high-fidelity simulations of transition and turbulent separation in turbomachinery.
However, because of the high cost of these leadership-class systems, industry and academia often purchase smaller commodity clusters utilizing similar types of processors once the latest hardware technology has been fully demonstrated on CFD problems and other important applications.

Turbulence Modeling
Current practice for CFD-based workflows utilizes steady Reynolds-averaged Navier-Stokes (RANS)



with one- or two-equation turbulence models204-205, although hybrid unsteady RANS/LES methods are increasingly common for certain classes of simulations in which swirling and intentionally separated flows are dominant, such as combustors. Techniques to combine near-wall RANS regions with outer, large-eddy simulation (LES) regions in these hybrid methods are immature. Many CFD design processes include an estimation of boundary layer transition, using a range of models from purely empirical correlations to coupled partial-differential equation (PDE) solutions of stability equations206-207. Both approaches involve much empiricism, may be missing some modes of transition, and are still evolving. As a result, no generalized transition prediction capability is in widespread use in Navier-Stokes CFD, and the default practice is to run the codes "fully turbulent". Steady-state CFD accounts for the vast majority of simulations, while unsteady flow predictions are inherently more expensive and not yet uniformly routine in the design process, with some exceptions.

Process Automation
Current CFD workflows are often paced by the geometry preprocessing and grid generation phases, which are significant bottlenecks. In some cases, where the design effort involves components of similar configurations, specialized, automated processes are built that considerably reduce set-up time, execution of the CFD solver, and post-processing of results. This process-to-production capability of the CFD workflow only occurs in areas where the design work is routine and the investment in automation makes business sense; single prototype designs and novel configurations continue to suffer the pacing limits of human-in-the-loop workflows because the payoff for automating is not evident. This issue is not unique to the aerospace industry.

Solution Uncertainty and Robustness
In practice, CFD workflows contain considerable uncertainty that is often not quantified. Numerical uncertainties in the results come from many sources, including approximations to the geometry, grid resolution, problem setup (including flow modeling and boundary conditions), and residual convergence. Although NASA and professional organizations such as ASME and AIAA have created standards for the verification and validation of CFD and heat transfer analyses, such techniques are not widely used in the aerospace industry. With a few notable exceptions, CFD is carried out on fixed grids that are generated using the best available practices to capture expected flow features, such as attached boundary layers208. Such approaches cannot reliably provide adequate resolution for flow features whose locations are not known a priori, such as shocks, shear layers, and wakes. Although grid refinement is often seen as a solution to grid resolution issues, it is seldom done in practice because uniform refinement is impractical in 3D. Adaptive mesh refinement strategies offer the potential for superior accuracy at reduced cost, but have not seen widespread use due to robustness, error estimation, and software complexity issues. Achieving consistent and reliable flow solver or residual convergence remains problematic in many industrial cases. Although many CFD codes are able to demonstrate convergence for a few simple problems, for many flows involving difficult flow physics or complex geometries, such as an aircraft in high-lift configuration, many of the current solver techniques are not strong enough to ensure robust convergence. Engineering judgment is required to interpret results that are not well converged, which introduces conservatism into decision making.

204 Spalart, P. R. and Allmaras, S. R., "A One-Equation Turbulence Model for Aerodynamic Flows", La Recherche Aerospatiale, No. 1, 1994, pp. 5-21.
205 Wilcox, D. C., Turbulence Modeling for CFD, DCW Industries, 3rd edition, November 2006.
206 Stock, H.W. and Haase, W., "Navier-Stokes Airfoil Computations with eN Transition Prediction Including Transitional Flow Regions", AIAA Journal, Vol. 38, No. 11, pp. 2059–2066, 2006, 10.2514/2.893.
207 Langtry, R. B. and Menter, F. R., "Correlation-Based Transition Modeling for Unstructured Parallelized Computational Fluid Dynamics Codes", AIAA Journal, Vol. 47, pp. 2894-2906, 2009, 10.2514/1.42362.
208 Mavriplis, D. J., Vassberg, J., Tinoco, E., Mani, M., Brodersen, O., Eisfeld, B., Wahls, R., Morrison, J., Zickuhr, T., Levy, D., and Murayama, M., "Grid Quality and Resolution Issues from the Drag Prediction Workshop Series", AIAA Journal of Aircraft, Vol. 46, No. 3, pp. 935-950, March 2009.


Furthermore, the use of steady-state flow solvers itself is in question for many flows of engineering interest.

Multidisciplinary Analysis and Optimization (MDAO)
Although the basic concepts of MDAO are fairly well accepted in the community, the routine use of MDAO methods is not, by any means, universal. At moderate levels of fidelity, it is common industrial practice to perform coupled multidisciplinary analyses (MDA) of the most tightly integrated disciplines in a design. Aerostructural analyses, conjugate heat transfer calculations, and aeroacoustic simulations all tend to take place in aircraft, spacecraft, jet engine, and rotorcraft analysis and design processes. High-fidelity CFD is not routinely used in such MDAs, although recent years have witnessed a significant rise in the coupling of state-of-the-art CFD with additional disciplines. While frameworks for the coupling of disciplinary analyses are widely available, the ability to couple CFD with other high-fidelity descriptions of participating disciplines is limited by the availability of coupling software and, more fundamentally, by a lack of general methodologies for accurate, stable, and conservative MDAs. The application of optimization techniques in industry is mostly limited to single-discipline simulations209-210. Although conceptual design tools have long benefited from multidisciplinary optimization (MDO) approaches, high-fidelity CFD-based optimizations are still rare. During the past decade, the development of advanced surrogate modeling techniques and the introduction of adjoint-based optimal shape design techniques have enabled the use of CFD in the aerodynamic optimization of aircraft and gas turbine components. However, the use of optimization with multiple disciplines treated using high-fidelity methods is still within the realm of advanced research and is by no means routine practice.

Vision of CFD in 2030 as Anticipated by NASA
This is in fact a mirror image of the report done by the US DOE, which will be covered later, but with an emphasis on CFD. Given the inherent difficulties of long-term predictions, our vision for CFD in 2030 is grounded in a desired set of capabilities that must be present for a radical improvement in CFD predictions. Of special interest are critical flow phenomena associated with the key aerospace applications, including commercial/military aircraft, engine propulsion, rotorcraft, space exploration, launch vehicle programs, air-breathing space access, and spacecraft entry211. This set of capabilities includes not only the accurate and efficient prediction of fluid flows of interest, but also the usability of CFD in broader contexts (including uncertainty quantification, optimization, and multidisciplinary applications) and in streamlined/automated industrial analysis and design processes. To complicate things further, CFD in 2030 must effectively leverage the uncertain and evolving environment of HPC platforms that, together with algorithmic improvements, will be responsible for a large portion of the realized improvements. The basic set of capabilities for CFD must include, at a minimum:

• Emphasis on physics-based, predictive modeling. In particular, transition, turbulence, separation, chemically reacting flows, radiation, heat transfer, and constitutive models must reflect the underlying physics more closely than ever before.
• Management of errors and uncertainties resulting from all possible sources:

209 Jeffrey Slotnick, Abdollah Khodadoust, Juan Alonso, David Darmofal, William Gropp, Elizabeth Lurie, and Dimitri Mavriplis, "CFD Vision 2030 Study: A Path to Revolutionary Computational Aerosciences", NASA/CR-2014-218178.
210 Same as above.
211 Same as above.


1. Physical modeling errors and uncertainties,
2. Numerical errors arising from mesh and discretization inadequacies, and
3. Uncertainties derived from natural variability, as well as epistemic uncertainties due to lack of knowledge of the parameters of a particular fluid flow problem.

• A much higher degree of automation in all steps of the analysis process is needed, including geometry creation, mesh generation and adaptation, the creation of large databases of simulation results, the extraction and understanding of the vast amounts of information generated, and the ability to computationally steer the process. Inherent to all these improvements is the requirement that every step of the solution chain execute with high levels of reliability/robustness to minimize user intervention.
• Ability to effectively utilize massively parallel, heterogeneous, and fault-tolerant HPC architectures. For complex physical models with nonlocal interactions, the challenges of mapping the underlying algorithms onto computers with multiple memory hierarchies, latencies, and bandwidths must be overcome.
• Flexibility to tackle both capability- and capacity-computing tasks in industrial and research environments, so that very large ensembles of reasonably sized solutions (such as those required to populate full-flight envelopes or operating maps, or for parameter studies and design optimization) as well as small numbers of very large-scale simulations can be accomplished.
• Seamless integration with the multidisciplinary analyses that will be the norm in 2030, without sacrificing accuracy or numerical stability of the resulting coupled simulation, and without requiring so large an amount of effort that only a handful of coupled simulations are possible.

Included in this desired set of capabilities is a vision of the interaction between the engineer/scientist, the CFD software itself, its framework and all the ancillary software dependencies (databases, modules, visualization, etc.), and the associated HPC environment. A single engineer/scientist must be able to conceive, create, analyze, and interpret a large ensemble of related simulations in a time-critical period (e.g., 24 hours), without individually managing each simulation, to a pre-specified level of accuracy. There should be less emphasis on the mechanics of running and collecting the information, and more emphasis on interpreting and understanding the results of the work. At the moment, CFD is not yet sufficiently predictive and automated to be used in critical/relevant engineering decisions by the non-expert user, particularly in situations where separated flows are present212.

Finally, we define a set of Grand Challenge (GC) problems that are bold and in fact may not be solvable in the 2030 timeframe, but that are used as drivers to identify critical technologies in need of investment and to serve as benchmarks for continually measuring progress toward the long-term development goals. These GC problems are chosen to embody the requirements for CFD in 2030, and they cover all important application areas of relevance to NASA's aeronautics mission, as well as important aspects of NASA's space exploration mission213. They are:

1. LES of an aircraft configuration across the full flight envelope.
2. Off-design turbofan engine transient simulation.
3. MDAO of a highly flexible advanced aircraft configuration.

212 Slotnick, J., Khodadoust, A., Alonso, J., Darmofal, D., Gropp, W., Lurie, E., and Mavriplis, D., "CFD Vision 2030 Study: A Path to Revolutionary Computational Aerosciences", NASA/CR-2014-218178.
213 See previous.


Technology Roadmap to Achieve the GC Challenges
The CFD technology roadmap is a complete and concise view of the key research technologies and capabilities that must be developed and integrated into production CFD. The individual elements on the roadmap were identified based on the results of the CFD user survey, detailed technical discussions held during the Vision 2030 CFD workshop, and interactions among the team members. Key technology milestones, proposed technology demonstrations, and critical decision gates are positioned along timelines that extend to the year 2030. Separate timelines are identified for each of the major CFD technology elements that comprise the overall CFD process. The key milestones indicate important advances in CFD technologies or capabilities that are needed within each technology element. Technology demonstrations are identified to help verify and validate when technology advances are accomplished, as well as to validate advances toward the simulation of the GC problems identified above. Specific details of the development plan for each technology element are given below.

10.3.1.1 High Performance Computing (HPC)
As mentioned previously, advances in HPC hardware systems and related computer software are critically important to the advancement of the state of the art in CFD simulation, particularly for high Reynolds number turbulent flow simulations. Based on feedback from the user community survey, we envision HPC technology advancing along two separate paths. Ongoing development of exascale systems, as mentioned earlier, will continue through 2030 and represents the technology that will most likely provide the large increase in throughput for CFD simulation in the future214. However, novel technologies, such as quantum computing or molecular computing, offer a true paradigm shift in computing potential and must be carefully considered at strategic points in the overall development plan, even though the technology is at a very low level today. In order to properly address the HPC challenge, three specific thrusts must be supported. First, current simulation software must be ported to evolving and emerging HPC architectures with a view toward efficiency and software maintainability. Second, investments must be made in the development of new algorithms, discretizations, and solvers that are well suited for massive levels of parallelism215-216. Finally, increased access to the latest large-scale computer hardware must be provided and maintained, not only for production runs, but also for algorithmic research and software development projects, which will be critical for the design and validation of new simulation tools and techniques217. We propose several key milestones that benchmark the advances we seek: modification of NASA and related CFD codes to execute efficiently on hierarchical-memory (GPU/co-processor) systems by 2020, initial evaluation of exascale performance on a representative CFD problem, and a demonstration of 30 exaFLOPS performance for one or more of the proposed GC problems in the 2030 time frame. Concurrently, we stress the importance of closely observing advances in revolutionary HPC technologies, such as superconducting logic, new memory technologies, and alternatives to current architectures. Because these technologies are in their infancy, we foresee decision gates in 2020, 2025, and 2030 to establish the ability of these systems to solve a relevant model problem. Implicit in this strategy is the need to provide access to experimental hardware on a continual basis and to explore radically new approaches to devising CFD simulation capabilities.

214 Kogge, P. (Ed.), "ExaScale Computing Study: Technology Challenges in Achieving Exascale Systems", Contractor report for AFRL Contract No. FA8650-07-C-7724, September 2008.
215 Mavriplis, D., Darmofal, D., Keyes, D., and Turner, M., "Petaflops Opportunities for the NASA Fundamental Aeronautics Program", AIAA Paper 2007-4084, 18th AIAA Computational Fluid Dynamics Conference, June 2007, 10.2514/6.2007-4084.
216 Sarkar, V. (Ed.), "ExaScale Software Study: Software Challenges in Extreme Scale Systems", DARPA IPTO, AFRL report under contract FA8650-07-C-7724, September 2009.
217 Biswas, R., Aftosmis, M. J., Kiris, C., and Shen, B. W., "Petascale Computing: Impact on Future NASA Missions", Petascale Computing: Architectures and Algorithms (D. Bader, ed.), Chapman and Hall/CRC Press, 2007.


If, at any of these decision points, the technology clearly shows its expected potential, we recommend increased investment to accelerate the use of these machines for CFD applications.

10.3.1.2 Physical Modeling
Advances in the physical modeling of turbulence for separated flows, transition, and combustion are critically needed to achieve the desired state of CFD. For the advancement of turbulent flow simulation, we propose three separate tracks for research: RANS-based turbulence treatments; hybrid RANS/LES approaches, in which the entire boundary layer is resolved with RANS-based models and the outer flow is resolved with LES; and LES itself, both wall-modeled and wall-resolved. Details on each of the three development tracks, and on transition and combustion modeling, are given below. Additionally, a longer-term, high-risk effort should investigate radically new approaches to physical modeling.
RANS-based turbulence models continue to be the standard approach used to predict a wide range of flows for very complex configurations across virtually all aerospace product categories. They are easy to use, computationally efficient, and generally able to capture wall-bounded flows, flows with shear, flows with streamline curvature and rotation, and flows with mild separation. For these reasons, and because RANS models will remain an important component of hybrid RANS/LES methods, their use will continue through 2030. An advanced formulation of the RANS-based approach, in which the eddy viscosity formulation is replaced with direct modeling of the Reynolds stresses (the Reynolds Stress Transport method), should in principle be able to capture the onset and extent of flow separation for a wider range of flows218. Currently, hybrid RANS/LES methods show perhaps the most promise in capturing more of the relevant flow physics for complex geometries at an increasingly reasonable computational cost219. From the user survey, the majority of participants ranked the continued development of hybrid RANS/LES methods as the top priority in the area of turbulence modeling. However, as mentioned previously, several issues still exist. First, the prediction of any separation in the boundary layer will still require improvements in RANS-based methods. Second, a seamless, automatic RANS-to-LES transition in the boundary layer is needed to enhance the robustness of these methods. Continued investment in hybrid RANS/LES methods to address these two critical shortcomings will be required. Additionally, more effective discretizations and solvers designed specifically for LES-type problems must be sought. When combined with advances in HPC hardware, these three developments will enable continued reduction of the RANS region as larger resolved LES regions become more feasible. It is fully anticipated that hybrid RANS/LES methods will become viable in production mode by the 2030 timeframe for problems typical of the proposed GCs.

10.3.1.3 Numerical Algorithms
The development of novel numerical algorithms will be critical to achieving the stated CFD 2030 goals. Indeed, the proposed GCs are sufficiently ambitious that advances in HPC hardware alone during the next 20 years will not be sufficient to achieve them. As demonstrated in Case Study 2, even for LES of relatively simple geometries, leadership-class HPC hardware in 2030 will be needed for 24-hour turnaround if existing algorithms are used. Thus, to tackle the proposed GCs, orders-of-magnitude improvements in simulation capability must be sought from advances in numerical algorithms220. The focus of investment must be on discretizations and solvers that scale to massive levels of parallelism, that are well suited for the high-latency, deep memory hierarchies anticipated in future HPC hardware, and that are robust and fault tolerant.

218 Eisfeld, B., "Reynolds Stress Modeling for Complex Aerodynamic Flows", European Conference on Computational Fluid Dynamics, ECCOMAS CFD 2010, Lisbon, Portugal, June 14-17, 2010.
219 Song, F., Haase, W., Peng, S-H., and Schwamborn, D. (Eds.), Progress in Hybrid RANS-LES Modeling, Springer, ISBN 978-3-642-31817-7, September 2011.
220 Mavriplis, D., Darmofal, D., Keyes, D., and Turner, M., "Petaflops Opportunities for the NASA Fundamental Aeronautics Program", AIAA Paper 2007-4084, 18th AIAA Computational Fluid Dynamics Conference, 2007.


A well-balanced research program must provide for incremental advances of current techniques (e.g., extending the scalability of current CFD methods to the exascale level whenever possible), while at the same time investing in the fundamental areas of applied mathematics and computer science to develop new approaches with better asymptotic behavior for large-scale problems and better suitability for emerging HPC hardware. Discretization techniques such as higher-order accurate methods offer the potential for better accuracy and scalability, although robustness and cost considerations remain221. Investment must focus on removing these barriers in order to unlock the superior asymptotic properties of these methods, while at the same time pursuing evolutionary improvements in other areas such as low-dissipation schemes, flux functions, and limiter formulations. Simultaneously, novel nontraditional approaches, such as Lattice-Boltzmann methods or other as-yet-undeveloped schemes, should be investigated for special applications. Improved linear and nonlinear solvers must be developed, and here as well the focus must be on highly scalable methods that are designed to be near optimal for the large-scale, time-implicit unsteady CFD and MDAO simulations anticipated in the future. These may include the extension of well-known matrix-based techniques (Krylov methods)222 and highly parallel multigrid methods223, or the development of completely novel approaches such as systematic upscaling methods224. Furthermore, these methods must be extensible to tightly coupled multidisciplinary problems. Investment in discretizations and solvers must also consider the potential of these methods to operate on dynamically adapting meshes, to enable optimization procedures, and to incorporate advanced uncertainty quantification capabilities. In many cases, adjoint technology225-226 will be required from the outset for all of these capabilities, but the potential of other more advanced technologies such as second-order gradients (Hessians)227-228 should be investigated as well. Longer-term, high-risk research should focus on the development of truly enabling technologies such as monotone or entropy-stable schemes in combination with innovative solvers on large-scale HPC hardware. The technology roadmap envisions the demonstration of improved robust and scalable solvers in the 2015-2017 timeframe, for both second-order and higher-order accurate methods. The demonstration of complete configuration grid-convergence technology in the 2020 time frame relies on the use of robust higher-order discretizations combined with improved scalable solvers and adaptive h-p refinement. Toward the 2030 time frame, it is anticipated that novel entropy-stable formulations will begin to bear fruit for industrial simulations.
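As a minimal illustration of the kind of matrix-based Krylov technique referenced above, the sketch below implements a Jacobi-preconditioned conjugate gradient iteration for a symmetric positive-definite system. It is illustrative only; the function name, the tolerances, and the toy 1D Poisson operator are assumptions for this example, and a production solver would add the parallel, preconditioning, and fault-tolerance machinery discussed in the text.

```python
import numpy as np

def jacobi_pcg(A, b, tol=1e-10, max_iter=500):
    """Jacobi-preconditioned conjugate gradient for SPD systems (illustrative sketch)."""
    Minv = 1.0 / np.diag(A)           # Jacobi preconditioner: inverse of the diagonal
    x = np.zeros_like(b)
    r = b - A @ x                     # initial residual
    z = Minv * r                      # preconditioned residual
    p = z.copy()
    rz = r @ z
    for k in range(max_iter):
        Ap = A @ p
        alpha = rz / (p @ Ap)         # step length along the current search direction
        x += alpha * p
        r -= alpha * Ap
        if np.linalg.norm(r) < tol:   # converged
            return x, k + 1
        z = Minv * r
        rz_new = r @ z
        p = z + (rz_new / rz) * p     # update conjugate search direction
        rz = rz_new
    return x, max_iter

# Toy usage: a 1D Poisson (tridiagonal) matrix as a stand-in for a CFD-like operator
n = 100
A = 2.0 * np.eye(n) - np.eye(n, k=1) - np.eye(n, k=-1)
b = np.ones(n)
x, iters = jacobi_pcg(A, b)
print(f"converged in {iters} iterations, residual = {np.linalg.norm(b - A @ x):.2e}")
```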

221 Kroll, N., Bieler, H., Deconinck, H., Couallier, V., van der Ven, H., and Sorensen, K. (Eds.), "ADIGMA – A European Initiative on the Development of Adaptive High-Order Variational Methods for Aerospace Applications", Notes on Numerical Fluid Mechanics and Multidisciplinary Design, Vol. 11, Springer, 2010.
222 Saad, Y., Iterative Methods for Sparse Linear Systems, 2nd Edition, SIAM, 2003.
223 Baker, A. H., Falgout, R. D., Kolev, Tz. V., and Yang, U. M., "Scaling Hypre's Multigrid Solvers to 100,000 Cores", High Performance Scientific Computing: Algorithms and Applications, M. Berry et al. (Eds.), Springer, 2012.
224 Brandt, A., "Multiscale Solvers and Systematic Upscaling in Computational Physics", Computer Physics Communications, Vol. 169, Issues 1-3, pp. 438-441, July 2005.
225 Jameson, A., "Aerodynamic Design via Control Theory", ICASE Report No. 88-64, November 1988; also J. of Scientific Computing, Vol. 3, pp. 233-260, 1988.
226 Errico, R. M., "What is an Adjoint Model?", Bulletin of the American Meteorological Society, pp. 2577-2591, 1997.
227 Taylor, A. C., Putko, M. M., Green, L. L., and Newman, P. A., "Some Advanced Concepts in Discrete Aerodynamic Sensitivity Analysis", AIAA Journal, Vol. 41, pp. 1224-1229, 2003, 10.2514/2.2085.
228 Rumpfkeil, M. P., and Mavriplis, D. J., "Efficient Hessian Calculations Using Automatic Differentiation and the Adjoint Method with Applications", AIAA Journal, Vol. 48, pp. 2406-2417, 10.2514/1.J050451.


10.3.1.4 Uncertainty Quantification (UQ)
With regard to uncertainty quantification, a new thrust in the area of probabilistic large-scale CFD for aerospace applications should be initiated. An initial effort in this area should focus on equipping current aerospace CFD tools with well-known uncertainty quantification techniques, such as sensitivity analysis and propagation methods using adjoints and forward linearization, non-intrusive polynomial chaos methods, and other reduced-order model formulations229-230. Additionally, a concerted effort should be made to characterize important aerospace uncertainties and to make these available to the general research community to enable relevant UQ research in these areas. Improved error estimation techniques must be investigated and developed, given the known deficiencies of current approaches (including adjoint methods). This will require a foundational program in the mathematics of error estimation and its application to CFD software. Finally, longer-term research must focus on statistical approaches, such as Bayesian techniques, for more accurately quantifying modeling and other nonlinear error sources231. The technology roadmap includes an early target date of 2015 for the characterization of typical aerospace uncertainties in order to stimulate work in this area. Improved error estimation techniques will be gradually brought into the simulation capabilities, and the state of these estimates will be assessed in the 2018 time frame. Comprehensive uncertainty propagation techniques, including discretization error and input and parameter uncertainties, in production-level CFD codes should be targeted for 2025, while the development of more sophisticated stochastic and Bayesian approaches will continue through the 2030 timeframe.

10.3.1.5 Geometry and Grid Generation
Substantial new investment in geometry and grid generation technology will be required in order to meet the CFD Vision 2030 goals. In general, this area has seen very little NASA investment during the last decade, although it remains one of the most important bottlenecks for large-scale complex simulations. Focused research programs in streamlined CAD access and interfacing, large-scale mesh generation, and automated optimal adaptive meshing techniques are required. These programs must concentrate on the particular aspects required to make mesh generation and adaptation less burdensome and, ultimately, invisible to the CFD process, while developing technologies that enable the capabilities required by Vision 2030 CFD applications, namely very large-scale parallel mesh generation, curved mesh elements for higher-order methods232-233, highly scalable dynamic overset mesh technology234, and anisotropic adaptive methods for time-dependent problems. It is important to realize that advances in these areas will require a mix of investments in incremental software development, combined with advances in fundamental areas such as computational geometry, possibly with smaller components devoted to high-risk disruptive ideas such as anisotropic cut-cell meshes235, strand mesh ideas236, and even meshless methods237.

229 Shankaran, S., and Jameson, A., "Robust Optimal Control using Polynomial Chaos and Adjoints for Systems with Uncertain Inputs", AIAA Paper 2011-3069, 20th AIAA Computational Fluid Dynamics Conference, 2011.
230 Ng, L. W-T., Huynh, D. B. P., and Willcox, K., "Multifidelity Uncertainty Propagation for Optimization Under Uncertainty", 12th AIAA Aviation Technology, Integration, and Operations (ATIO) Conference and 14th AIAA/ISSMO Multidisciplinary Analysis and Optimization Conference, 2012, 10.2514/6.2012-5602.
231 Press, S. J., Subjective and Objective Bayesian Statistics: Principles, Methods and Applications, 2nd Edition, Wiley, New York, 2003.
232 Wang, L., Anderson, W. K., Erwin, J., and Kapadia, S., "High-Order Methods for Solutions of Three-Dimensional Turbulent Flows", AIAA Paper 2013-856, 51st AIAA Aerospace Sciences Meeting, January 2013.
233 Persson, P-O., Willis, D., and Peraire, J., "The Numerical Simulation of Flapping Wings at Low Reynolds Numbers", AIAA Paper 2010-724, 48th AIAA Aerospace Sciences Meeting, January 2010.
234 Pulliam, T. H., and Jespersen, D. C., "Large Scale Aerodynamic Calculation on Pleiades", Proceedings of the 21st International Conference on Parallel Computational Fluid Dynamics, Moffett Field, California, May 18-22, 2009.


Additionally, because significant technology currently resides with commercial software vendors, particularly for CAD interfaces and access, involving these stakeholders in the appropriate focused research programs will be critical for long-term success. Innovative approaches for achieving such partnerships must be sought out, such as the formation of consortia for the definition and adoption of standards, or for addressing other potential issues such as large-scale parallel licensing of commercial software. The technology development roadmap envisions the demonstration of tight CAD coupling and production adaptive mesh refinement (AMR) in the 2015-2017 time frame, followed by maturation of large-scale parallel mesh generation in the 2020-2025 time frame, leading ultimately to fully automated in-situ mesh generation and adaptive control for large-scale time-dependent problems by 2030.

10.3.1.6 Knowledge Extraction
Petascale and exascale simulations will generate vast amounts of data, and various government agencies such as the NSF and DOE have instituted major programs in data-driven simulation research. In order to make effective use of large-scale CFD and MDAO simulations in aerospace engineering, a thrust in data knowledge extraction should be initiated. Ideally, this should contain three components: a visualization component, a database management component, and a variable-fidelity data integration component. Methods to process and visualize very large-scale unsteady CFD simulations in real time, including results from higher-order discretizations, are required to support the advanced CFD capabilities envisioned in 2030. Although many of the current efforts in maturing visualization technology are being led by commercial vendors, who continue to supply enhanced capabilities in this area, more fundamental research to directly embed visualization capabilities into production CFD tools optimized for emerging HPC platforms is needed to achieve real-time processing238. Moreover, the CFD capability in 2030 must provide the analyst with a more intuitive and natural interface into the flow solution to better understand complex flow physics. Foreseeing the capability to generate large databases with increasing computational power, techniques for rapidly integrating these databases and querying them in real time will be required. Finally, integrating high-fidelity simulation data with lower-fidelity model data, as well as experimental data from wind tunnel tests, engine test rigs, or flight tests, will provide a powerful approach for reducing overall risk in aerospace system design239. Techniques for building large-scale flexible databases are in their infancy, and range from simple software infrastructures that manage large numbers of simulation jobs to more sophisticated reduced-order models240, surrogate models, and Kriging methods241. The objective of a research thrust in this area should be to apply existing techniques to current CFD simulation capabilities at large scale, while simultaneously performing foundational research in the development of better reduced-order models and variable-fidelity models that are applicable to aerospace problems and can support embedded uncertainty quantification strategies.

235 Modisette, J., and Darmofal, D., "Toward a Robust, Higher-Order Cut-Cell Method for Viscous Flows", AIAA Paper 2010-721, 48th AIAA Aerospace Sciences Meeting, January 2010.
236 Katz, A., Wissink, A., Sitaraman, J., and Sankaran, V., "Application of Strand Meshes to Complex Aerodynamic Flow Fields", AIAA Paper 2010-4934, 28th AIAA Applied Aerodynamics Conference, June 2010.
237 Katz, A., and Jameson, A., "Meshless Scheme Based on Alignment Constraints", AIAA Journal, Vol. 48, pp. 2501-2511, 2010.
238 Wang, Y., Yu, H., and Ma, K-L., "Scalable Parallel Feature Extraction and Tracking for Large Time-Varying 3D Volume Data", Proceedings of EGPGV 2013, May 2013, pp. 55-62.
239 The 1st Workshop on Integration of Experimental Fluid Dynamics (EFD) and Computational Fluid Dynamics (CFD), JAXA Special Publication SP-09-002, January 2010.
240 Washabaugh, K., Amsallem, D., Zahr, M., and Farhat, C., "Nonlinear Model Reduction for CFD Problems Using Local Reduced-Order Bases", AIAA Paper 2012-2686, 42nd AIAA Fluid Dynamics Conference, June 2012.
241 Han, Z-H., and Görtz, S., "Hierarchical Kriging Model for Variable-Fidelity Surrogate Modeling", AIAA Journal, Vol. 50, pp. 1885-1896, 2012, 10.2514/1.J051354.
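As a hedged illustration of the surrogate/Kriging idea mentioned in the knowledge extraction discussion above, the sketch below builds a simple zero-mean Kriging (Gaussian-process) interpolant over a handful of notional simulation samples. The kernel, its length scale, and the sample data are assumptions made only for this example; they are not prescribed by the roadmap.

```python
import numpy as np

def rbf_kernel(xa, xb, length_scale=0.5):
    """Squared-exponential covariance between two sets of 1D sample points."""
    d = xa[:, None] - xb[None, :]
    return np.exp(-0.5 * (d / length_scale) ** 2)

def kriging_predict(x_train, y_train, x_query, noise=1e-8):
    """Simple (zero-mean) Kriging predictor: mean and variance at query points."""
    K = rbf_kernel(x_train, x_train) + noise * np.eye(len(x_train))
    Ks = rbf_kernel(x_query, x_train)
    alpha = np.linalg.solve(K, y_train)            # weights for the training responses
    mean = Ks @ alpha                              # surrogate prediction
    v = np.linalg.solve(K, Ks.T)
    var = 1.0 - np.sum(Ks * v.T, axis=1)           # predictive variance (unit prior variance)
    return mean, var

# Notional database: a few "expensive simulation" samples of some response quantity
x_train = np.array([0.0, 0.25, 0.5, 0.75, 1.0])    # e.g., a normalized flow parameter
y_train = np.sin(2 * np.pi * x_train)              # stand-in for stored CFD outputs
x_query = np.linspace(0.0, 1.0, 11)
mean, var = kriging_predict(x_train, y_train, x_query)
print(np.round(mean, 3))
```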


The technology roadmap envisions the demonstration of real-time analysis and visualization of a notional 10^10-point unsteady CFD simulation in 2020, and of a 10^11-point simulation in 2025. These technology demonstrations would be an integral part of the GC problems designed to benchmark advances in other CFD areas. The development of reduced-order models and other variable-fidelity models will entail long-term research and will likely remain an active research topic past the 2030 time frame. However, the technology roadmap envisions periodic assessment of the state of the art in these areas at 5- to 10-year intervals, with investment directed toward demonstrating promising approaches on large-scale aerospace applications.

10.3.1.7 Multidisciplinary Design and Optimization
The ability to perform CFD-based multidisciplinary analysis (MDA) and analysis/optimization (MDAO) relies on the availability of future capabilities that need to be developed between now and 2030. Pervasive and seamless MDAs (that can be routinely exercised in industrial practice for configuration studies, e.g., full aero-thermo-elastic/aero-acoustic simulations of entire airframe/propulsion systems including shielding) will require the development of accepted standards and APIs for disciplinary information and the required multidisciplinary couplings (such as with acoustics, combustion, structures, heat transfer, and radiation). A concerted effort is envisioned that results in a set of standards available to the community around 2016. In parallel with this effort, it will also be necessary to develop high-fidelity coupling techniques that guarantee the accuracy and stability of high-fidelity, tightly coupled MDAs242, while ensuring that the appropriate conservation principles are satisfied with errors below acceptable thresholds. This capability, together with the coupling software that includes such information transfers, must be available around 2018. Together, the standards and the coupling techniques/software would enable demonstrations of two-way coupled MDAs, with the best and most robust existing CFD solvers of the time and with guaranteed coupling fidelity, by the year 2020. Such demonstrations can focus on multiple aerospace problems of interest, including aircraft aero-structural/aero-elastic analyses, aircraft aero-acoustics, rotorcraft aero-structural and aero-acoustic couplings, unsteady combustion, re-entry aerothermodynamics and material response, and the like. Initially, such routine MDAs would focus on portions of an entire vehicle (around 2020) and would transition to the treatment of the entire system around 2025. A number of capabilities also must be developed in order to enable MDAO with and without the presence of uncertainties (robust and reliability-based design). A major research component that is likely to span a significant period (2015-2025) is the work needed to endow industrial-strength CFD solvers with both gradient calculation and uncertainty quantification capabilities for use in multidisciplinary optimization. Some of this work has been described in the "Numerical Algorithms" section. For the gradient/sensitivity analysis capability, we envision that the CFD solver will be able to compute this information for fully unsteady flows and for the turbulence models available at the time.
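To make the gradient/sensitivity point above concrete, the standard discrete-adjoint identity (stated here generically as an illustration, not as the specific formulation any particular solver will adopt) shows why adjoints are attractive when there are many design variables: one extra linear solve yields the sensitivity of an objective J with respect to every design variable α. With residual equations R(u, α) = 0 and objective J(u, α),

\[
\left(\frac{\partial R}{\partial u}\right)^{T} \lambda = \left(\frac{\partial J}{\partial u}\right)^{T},
\qquad
\frac{dJ}{d\alpha} = \frac{\partial J}{\partial \alpha} - \lambda^{T}\,\frac{\partial R}{\partial \alpha},
\]

so the cost of the full gradient is roughly one flow solve plus one adjoint solve, independent of the number of design variables, whereas finite differences require one additional flow solve per variable.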
Finally, all these new capabilities must come together in a series of MDAO grand-challenge demonstrations in the 2030 timeframe.

Recommendations
In order to effectively execute the CFD development plan described above and achieve the goals laid out in the vision of CFD in 2030, a comprehensive research strategy and set of recommendations are presented. This research strategy calls for the renewed preeminence of NASA in the area of computational sciences and aerodynamics, and calls for NASA to play a leading role in the pursuit of revolutionary simulation-based engineering.

242 "Multiphysics Simulations: Challenges and Opportunities", Argonne National Laboratory Report ANL/MCS-TM-321, report from a workshop sponsored by the Institute for Computing in Science (ICiS), Park City, Utah, June-August 2011.


Aerospace engineering has had a long history of developing technology that impacts product development well beyond the boundaries of aerospace systems. As such, NASA is a critical force in driving technology throughout aerospace engineering directly by fulfilling its obligation. Computational methods are a key example of this broad impact, as NASA has historically been a leader in the development of structural finite-element methods, computational fluid dynamics, and applications of HPC to engineering simulations. NASA's effort must be targeted toward research and technology development that can make revolutionary impacts on simulation-based engineering in the aerospace sciences. In particular, the current state of CFD is such that small, incremental improvements in existing capability have not had revolutionary effects. In an environment of constrained resources, this will require that NASA evaluate its activities with a critical eye toward supporting those efforts whose impact could be revolutionary. To ensure that the technology plan and roadmap are as effective as possible, we propose specific recommendations (see Figure 10.2). Naturally, individual research thrusts affect multiple technical areas, which in turn affect the ability to meet various milestones and progress toward the GC problems.

Figure 10.2 Proposed New Computational Sciences Program Structure. The six thrust areas and their elements are:
• Geometry and Grid Generation: CAD access and interfaces; large-scale parallel mesh generation; adaptive mesh refinement; curved mesh elements for higher-order methods.
• HPC: increasing access to leading-edge HPC hardware; porting of current and future codes to leading-edge HPC; radical emerging HPC technologies.
• Physical Modeling: RANS turbulence modeling; hybrid RANS-LES modeling (improved RANS component, seamless interface); LES (wall-modeled and wall-resolved); transition; combustion; radically new modeling approaches.
• Numerical Algorithms: advances in current algorithms for HPC; novel discretizations (higher-order methods, low-dissipation/dispersion schemes, foundational novel approaches); solvers (scalable linear and nonlinear solvers, enhancements for MDAO and UQ); UQ (defining aerospace uncertainties, leveraging known techniques, improved error estimation, statistical approaches).
• MDAO: interfaces and standards; accurate and stable coupling techniques; system-level UQ support and sensitivities.
• Knowledge Management: visualization; database management; variable-fidelity models.


HPC Envisioned by the Department of Energy (DOE)
The aim here is to examine the main issues raised by 'going to the exascale' and to provide some guidance on the level of risk involved in pursuing243, and not pursuing, this direction of high performance computing. 'Going to the exascale' will mean a radical change in computing architecture: vastly increasing the levels of parallelism, to the point of millions of processors working in concert, which will force radical changes in how hardware is designed, in how we go about solving problems (i.e., the application codes), and in how we marry application codes to the underlying hardware (e.g., the compilers, I/O, middleware, and related software tools). Understanding the advantages to be gained by going to the exascale, and evaluating the risks involved in going down this path, requires both an evaluation of past experiences in moving from the megaflop era to the present petaflop era and an assessment of the readiness of advanced applications to take transformative advantage of exascale computing. The challenges inherent in developing exascale computing as a practical endeavor are considerable, and significant investments will be needed to accomplish this.

What is Exascale Computing?
Exascale computing refers to computing systems capable of at least one exaFLOPS, or a billion billion calculations per second244. Such capacity represents a thousandfold increase over the first petascale computer, which came into operation in 2008245. (One exaFLOPS is a thousand petaFLOPS, or a quintillion, 10^18, floating point operations per second.) At a supercomputing conference in 2009, Computerworld projected exascale implementation by 2018. Exascale computing would be considered a significant achievement in computer engineering, for it is believed to be on the order of the processing power of the human brain at the neural level (the functional equivalent may be lower). It is, for instance, the target power of the Human Brain Project246.

Why Exascale?
The most obvious question, indeed the key question, is of course: why go to the exascale? This question is not meant in the trivial sense that one would pose for any expenditure whatsoever in leading-edge computing technologies. Rather, it is motivated by the fact that the transition from current petascale computing to the exascale will involve investments across the board, from hardware to fundamental algorithms, programming models, compilers, and application codes, that will dwarf previous levels of investment made as computer architectures have evolved in the past. That is, we recognize that the value to society extracted from this change in computing paradigm has to be commensurate with the costs of developing this type of computing; given the substantial costs, we need to be sure that the extracted value is similarly substantial. We will argue in the following that the extracted value is in fact very large, and will do so in two stages: first by making some general points about the present frontiers of computing independent of discipline, and then by focusing on a few example disciplines to illustrate the more general point.
Range of Applications That May Be Transformed by Going to the Exascale
As discussed earlier, a key question to be addressed in considering going to the exascale is the readiness of key applications to take this step, as well as the likelihood that doing so will lead to transformative changes in these application areas.

243 The Opportunities and Challenges of Exascale Computing, Summary Report of the Advanced Scientific Computing Advisory Committee (ASCAC) Subcommittee, Fall 2010, USDOE.

244 From Wikipedia, the free encyclopedia.
245 National Research Council, "The Potential Impact of High-End Capability Computing on Four Illustrative Fields of Science and Engineering", The National Academies, p. 11, ISBN 978-0-309-12485-0.
246 The Opportunities and Challenges of Exascale Computing, Summary Report of the Advanced Scientific Computing Advisory Committee (ASCAC) Subcommittee, Fall 2010, USDOE.


This question is addressed in the present section, focusing once again on a selection of disciplines to illustrate the breadth of applications that are ready for this transition.

10.4.3.1 Aerospace, Airframes, and Jet Turbines
Computing at an extreme scale will have transformational effects on several key applications in the aerospace industry247. The move from RANS to LES as the industry standard, and its use in the design cycle, represents a paradigm shift for the aerospace industry. In addition, there are several outstanding scientific problems in these sectors that can be understood, and hopefully controlled, using extreme-scale computing. The accuracy achieved with the RANS approach for prediction of quantities of engineering interest in the airframe industry has reached a plateau, owing to the epistemic uncertainties inherent in such turbulence models. As a result, the design of aircraft and propulsion systems relies on an iterative process in which several expensive prototypes are constructed and tested in wind tunnels. Hybrid RANS/LES approaches grounded in first principles can overcome the limitations of RANS and enhance the predictive capability of CFD beyond the present seemingly stagnant state of speculative trial-and-error in design248. In addition, building a complete flight-envelope characterization (accounting for irreducible uncertainties, e.g., angle of attack, flight conditions, and geometry) will only be possible with computing at the exascale and beyond. Such a design framework for aerodynamically optimized vehicles and propulsion systems is a critical resource for the design and construction of next-generation aircraft and propulsion systems. Figure 10.3 provides estimates of the computing requirements to meet these design goals for several Grand Challenges in aerospace systems, giving the computer speed and memory requirements for analysis and design of airfoils, wings, and complete aircraft at three different stages of approximation.

Figure 10.3 Computer speed and memory requirements for the Grand Challenge

247 See above.
248 Wall-modeled LES (WMLES) and hybrid RANS-LES methods provide a clear path to first-principles design of next-generation aircraft as exascale computing arrives. Transitioning this technology to future exascale platforms will have a transformative impact upon simulation-based engineering design, making possible the design of aerodynamically optimized vehicles including integrated effects of propulsion, structures, and active controls, a "Grand Challenge" of aerodynamic design.


One of the major problems confronting the aircraft industry is the aerodynamic noise generated by engine exhaust jets and airframes, particularly during take-off and landing approaches. The noise problem has been a major issue for high-speed commercial aircraft and, more recently, for military aircraft, both for the impact on communities surrounding airports and military bases and for the crews stationed on aircraft carrier decks. It is known that turbulence is a major contributor to aircraft noise. Unfortunately, modern optical diagnostic techniques are far from adequate for measuring the spatial-temporal data needed to reveal the mechanics of aerodynamic noise; only high-fidelity simulation techniques, such as LES, are capable of predicting both the far-field noise and the details of the noise-generating turbulent eddies. Exascale computing would have a transformational impact on the discovery of the mechanics of noise generation and would be instrumental in designing noise mitigation strategies. Figure 10.4 shows the turbulent flow from a supersonic exhaust jet (M = 1.7) obtained from a breakthrough, state-of-the-art LES computation in 2010. This first-of-a-kind calculation lacks a high-fidelity representation of the flow inside the nozzle, and the agreement with the measured noise data is only fair, presumably due to this inadequate grid resolution. As exascale computing tools become available, high-fidelity tools would not only be used to understand and predict flow-generated noise, they would be used to learn how to control it. Such demonstration calculations have been extremely computer intensive and limited to very simple flows. Exascale computing would be the enabling technology for complex flow control and shape optimization (e.g., of aircraft wings and nozzle exits), potentially leading to a major transformational effect on the aerospace industry. Another outstanding technical problem in the gas-turbine industry is the migration of hot fluid parcels from the combustor to the turbine. This hot-streak migration is a limiting factor in the design of turbines, as turbine blades designed on the basis of mean flow temperatures are damaged severely when they encounter the migrating hot spots. High-fidelity simulation of the flow inside the combustor of a jet engine is a daunting task due to the multi-physics phenomena present. Even in modern LES computations of combustors using petascale-class computers, reduced-order models are used for critical phenomena such as primary atomization of the injected liquid fuel into micron-size droplets, the evaporation of the droplets, and the chemical mechanisms involved. Exascale computing would be the enabling technology for simulation of jet engine combustors based on first principles, which in turn promises to facilitate the discovery of mitigating strategies for the suppression of hot-streak migration249.

Figure 10.4 A supersonic jet engine nozzle rapidly accelerates high-pressure gas into the atmosphere


10.4.3.2 Combustion
Reliable prediction requires, for example, the incorporation of heterogeneous kinetics with quantified uncertainties in turbulent combustion simulations for processes such as soot formation/burnout, and increased-fidelity coupling of high-pressure, low-temperature chemistry with turbulent transport; these vital enhanced modeling techniques will only be feasible at exascale computing performance levels. In particular, combustion scientists must focus on the science underlying the development of non-petroleum-based fuels, including carbon-neutral biofuels, and their optimal use in transportation. This science intrinsically involves chemistry with transport at conditions far from equilibrium and at extreme pressures, and a coordinated multi-scale approach for understanding and predicting combustion in turbulent environments250. Combustion in practical devices covers a myriad of time and length scales, from the scale of the electron to the largest scales of turbulence, which depend on the geometry of the device. To tackle this daunting challenge and complexity, a multi-scale approach is adopted wherein experiments, theory, and direct computation are brought to bear on a limited range of scales (4-5 decades), and the fundamental physical insights gained are encapsulated in reduced-order parameterizations that are used to upscale knowledge and bridge the scales. Several high-fidelity computational approaches in both the atomistic and continuum regimes utilize petascale computing. Exascale computing would greatly facilitate higher fidelity or access to more practically relevant parameter regimes (e.g., higher pressure, higher turbulence levels, and more complex fuels). In the continuum regime, where turbulence scales interact with flame, ignition, and mixing scales, turbulence-chemistry interactions are important. Virtually all combustion devices operate in turbulent environments because of the enhanced mixing and greater efficiency. Many of the fundamental turbulence-chemistry interactions are amenable to investigation by first-principles direct numerical simulation (DNS) and high-fidelity large-eddy simulation (LES) of building-block, laboratory-scale flows. Whereas DNS focuses on fully resolving the fine-grained physics, LES resolves the energy-containing end of the turbulence spectrum down to a specified cut-off in the inertial or dissipative range, and the unresolved sub-grid scales are modeled. As such, these methods are complementary. Both DNS and LES require the horsepower of high-performance supercomputing at the exascale and beyond to resolve all relevant flow and chemical scales. Exascale simulations are required, for example, to understand the coupling between low-temperature ignition kinetics and turbulent mixing at high pressure that determines lifted flame stabilization, ignition timing, rate of combustion, and emissions characteristics. Understanding the complex low-temperature, high-pressure kinetics of alternative fuels and their coupling with turbulent transport at high pressure requires much greater resolution and the transport of large numbers of reactive scalars, afforded only by extreme-scale computing power.
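A standard back-of-the-envelope estimate (offered here as a hedged illustration, not a figure from the source) helps explain why DNS, and to a lesser degree wall-resolved LES, of practical combustors demands exascale resources. For homogeneous turbulence the ratio of the largest scale L to the smallest (Kolmogorov) scale η grows with Reynolds number, so the grid and time-step counts grow rapidly:

\[
\frac{L}{\eta} \sim Re^{3/4}
\;\Rightarrow\;
N_{\text{grid}} \sim \left(Re^{3/4}\right)^{3} = Re^{9/4},
\qquad
N_{\text{steps}} \sim Re^{3/4},
\qquad
\text{cost} \sim N_{\text{grid}}\,N_{\text{steps}} \sim Re^{3}.
\]

Each order-of-magnitude increase in Reynolds number therefore raises the DNS cost by roughly three orders of magnitude, and finite-rate chemistry adds stiffness and many additional reactive scalars on top of this.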
Moreover, in-situ reduction strategies for accurate and computationally affordable inclusion of heterogeneous kinetics with quantified uncertainties in DNS and LES are required. The insights gained from exascale simulations will enable the development of predictive multi-scale models to optimally design future evolving fuels and engines. Future predictive simulation tools running on exascale computing systems will enable deep understanding of underlying chemical and combustion science processes, enhance combustion engine design and performance, and ultimately yield a dramatic reduction in engine development timescales, time to market, and development costs, while ensuring the timely achievement of energy security and emissions goals and enhancing the competitiveness of U.S. engine manufacturers and fuel producers.

249 The Opportunities and Challenges of Exascale Computing, Summary Report of the Advanced Scientific Computing Advisory Committee (ASCAC) Subcommittee, Fall 2010, USDOE.
250 See previous.


10.4.3.3 Climate Modeling
Although substantial uncertainty exists as to the degree and impacts of future climate change, especially at local and regional scales, it is generally agreed that significant adaptation will be required. Furthermore, the magnitude of climate change later in the century depends upon the near- and intermediate-term mitigation strategies used to reduce the emission of greenhouse gases. These strategies must also satisfy the increasing energy demand of a growing global population experiencing an improvement in its standard of living. Predicting these future climate changes and evaluating the effects of mitigation strategies require Earth system models (ESMs) that are far more accurate and comprehensive than those in use today. Integrated assessment models provide the framework for climate predictions by defining the emissions scenarios and elucidating the relationships among the natural and human systems that are at the core of climate change studies. In the next decade, integrated assessment and comprehensive ESMs will probably be combined into a single system that could be used to investigate scientific issues and to formulate policy options for adaptation and mitigation. The predictions from integrated ESMs will be most credible if the important processes in the climate system, for example mixing by ocean eddies, are simulated at their native spatial and temporal scales. Critical organized features in the atmosphere and ocean, including clouds and eddies, have characteristic sizes of 1 to 10 km. Some of the major sources of uncertainty in climate predictions from existing models are associated with the aggregate effects of these phenomena. Experience with current climate models suggests that simulation of climate change with a model at 10-km grid resolution is inherently a petascale problem. In fact, even higher resolution is required to resolve these features with sufficient fidelity to the physical principles underlying their formation and evolution. Since the computational cost increases nonlinearly with resolution, it is likely that predictions of societal and environmental change at 1-km resolution would require truly extreme-scale computers.

10.4.3.4 Computational Biology
The ultimate goal of exascale computing applications to challenges in modern biology is to go from atoms to organs, or from microbes to ecosystems: for example, to enable an understanding of how the brain works as an energy-efficient, biologically based information system, or to understand microbial processes and their impact on the geosphere. In the process, these newly enlarged scales of computing will resolve unfathomably complex research issues in a host of fields as diverse as neuroscience and microbial metagenomics. At exascale, new scalable tools that admit a variety of time, space, and trajectory sampling methods (and fully exploit the hundreds of millions of cores of an exascale machine) will enable long time integrations, implicit solvation conditions, and mixed molecular-mechanics and quantum-mechanics models, allowing breakthrough science. For example, a large biochemical network within a full-scale model of a eukaryotic cell could be modeled in the span of a few hours. It is important to note that the first million-atom simulation in biology was conducted just five years ago: an all-atom simulation of the ribosome performed at Los Alamos National Laboratory. This million-particle milestone had already been achieved a decade earlier in materials science and cosmology (computational scientists in both of those fields now perform multibillion-particle simulations). While biology researchers have achieved impressive methodological advances that permit the modeling of the largest assemblies in the cell, it is only for short periods of time. Moreover, these simulations are unlikely to scale to the size of a single cell, even a small bacterium, for relevant times such as minutes or hours, even if researchers can employ computers capable of achieving 1,000 petaFLOPS. Today, researchers are limited to the microsecond timescale for protein folding because of the huge number of intermolecular interaction computations required.
This million particle simulation milestone had already been achieved a decade prior in materials science and cosmology (computational scientists in both these fields now perform multibillion-particle simulations). While biology researchers have achieved impressive methodological advances that permit the modeling of the largest assemblies in the cell, it is only for short periods of time. And, these simulations are unlikely to scale to the size of a single cell, even a small bacterium, for relevant times such as minutes or hours even if researchers can employ computers capable of achieving 1,000 petaflops/s. Today, researchers are currently limited to the microsecond timescale for protein


Scientists also lack rigorous coarse-grained models that permit the scaling up of macromolecular pathways and supramolecular cellular processes. Similarly, systems biology methods lack the dynamic resolution needed for coupling genomic and other data in order to fully map cellular networks, to predict their functional states, and to control the time-varying responses of living cells. Nor can current kinetics models adequately analyze the dynamics of complex living systems. Exascale computing will be needed to achieve those capabilities. Within the next decade, scientists expect to have the complete genome sequences of more than 10,000 bacteria, archaea, and other single-celled microbes. Exascale computing platforms will make it possible, in principle, to systematically reconstruct the metabolic networks of all sequenced microbes through automated comparative analysis, to reconstruct their regulatory networks by integrating a variety of data sources, and to combine these reconstructions into functional models of cellular states. Exascale computing will be critical to making this a routine class of computation, so that it can become part of the standard way we analyze genomes in the future.

10.4.3.5 Materials Science
Materials innovations are central to many of the technological advances responsible for our quality of life and prosperity. In fact, many of the disruptive technological advances since the turn of the last century (modern transportation, medical treatments and prosthetics, space exploration, global communication, computers and the electronics industry) used advances arising from every corner of the materials world: metals, ceramics, semiconductors, polymers, and novel combinations of these. Materials establish and support entire industries, and tens of millions of manufacturing jobs depend on the availability of these advanced materials at affordable costs. A quantifiable understanding of novel materials and their response is central as well to the technological challenges facing the country. Whether it is ceramics for high-efficiency automobiles, photovoltaics for next-generation solar power, or smart alloys for efficient building construction, the nation requires the development of advanced materials with superior properties that will drive the next generation of technologies. In the highly competitive global marketplace in which we find ourselves, minimizing time to solution and time to market is crucial. It is instructive to consider two workhorse techniques for materials modeling, hydrodynamics and molecular dynamics, and to examine the reasons why a simulation might fail to provide sufficiently useful information. Molecular dynamics simulations are characterized by a force field or potential, involving many adjustable parameters, which describes the interactions between atoms. No parameters are required to describe the response of the material; all of the constitutive response emerges naturally from the interaction potentials. Such calculations are currently limited in size to fractions of a cubic micron simulated for tens of nanoseconds, even on the largest computers. Hydrodynamics, by comparison, involves many adjustable parameters describing both the interactions and the material response. However, there is no real size or time limit in the simulation. There is a practical lower limit on resolution, as it makes no sense to model an atomically sized region of space using continuum equations.
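As a minimal sketch of what "a force field or potential, involving many adjustable parameters" means in practice, the snippet below evaluates the classic Lennard-Jones pair potential and the resulting forces for a handful of atoms. The parameter values, the atom positions, and the brute-force O(N²) pair loop are illustrative assumptions only; production molecular dynamics codes use neighbor lists, parallel domain decomposition, and far richer force fields.

```python
import numpy as np

def lj_forces(positions, epsilon=1.0, sigma=1.0):
    """Lennard-Jones energy and forces via a brute-force pair loop (illustrative only)."""
    n = len(positions)
    forces = np.zeros_like(positions)
    energy = 0.0
    for i in range(n):
        for j in range(i + 1, n):
            rij = positions[i] - positions[j]
            r2 = np.dot(rij, rij)
            inv_r6 = (sigma ** 2 / r2) ** 3                      # (sigma/r)^6
            energy += 4.0 * epsilon * (inv_r6 ** 2 - inv_r6)     # pair energy
            # epsilon and sigma are the "adjustable parameters" of this simple force field
            f = 24.0 * epsilon * (2.0 * inv_r6 ** 2 - inv_r6) / r2 * rij
            forces[i] += f
            forces[j] -= f
    return energy, forces

# Tiny usage example: four atoms placed near the potential-minimum spacing
pos = np.array([[0.0, 0.0, 0.0], [1.12, 0.0, 0.0], [0.0, 1.12, 0.0], [1.12, 1.12, 0.0]])
e, f = lj_forces(pos)
print(f"potential energy = {e:.3f}")
```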
At a given level of computing, computational scientists using either method encounter two common barriers to success: (a) the largest (or most finely resolved) simulation possible is still too small (or too poorly resolved) to capture the relevant behavior of interest, or (b) the most complex, compute-intensive simulation that can be solved in a reasonable time is still too simple or approximate to adequately describe the physics of interest. In many cases both (a) and (b) are true, which is particularly damning, since it prevents the investigator from performing the traditional trade-off between these two constraints: very often, one makes simplifying approximations to enable a larger simulation, or investigates smaller systems in order to perform a more complicated calculation. Investigating grain formation using molecular dynamics, for example, may not be possible even in the simplest metals on today's computers. The availability of an exascale platform will move the location of these constraints, allowing, quite generally, more detailed calculations of more complex materials.


State-of-the-art calculations involving billions of atoms have been performed that demonstrate the ability to model macroscopic (i.e., continuum) materials behavior with an atomistic model that makes no assumptions about the cooperative response. Figure 10.5 shows a detailed view of a 9-billion-atom molecular dynamics simulation of a developing Kelvin-Helmholtz instability at the sheared interface between aluminum and copper. With the development of an exascale computer, it is possible that such a calculation (which was heroic on a petascale computer) could be performed on demand during a hydrodynamics calculation, determining, for example, the equation of state for a mixed region at precisely the temperature, pressure, and composition required. By tabulating this information as it is generated, one can envision that such a simulation would teach itself as it runs, learning only those regions of this three-dimensional phase space that are needed.

Figure 10.5 Detail view of a 9-billion-atom molecular dynamics simulation of the instability

10.4.3.6 Nuclear Engineering
Recent studies have reviewed the status, basic science, challenges, opportunities, and research needs for advanced nuclear energy systems, with specific attention to the role of predictive modeling and simulation (M&S) in addressing the difficulties posed by the radioactive materials and harsh environments found in these systems:

• Computational M&S offers the opportunity to accelerate nuclear energy development by simulating complex systems to evaluate options and predict performance, thus narrowing the technology path and optimizing testing requirements.
• Today's high-performance computational systems are capable of modeling complete reactor systems and related technologies; the availability of exascale systems will enable high-fidelity M&S that can further improve the performance of existing reactors and have a significant positive impact on both the design and the operation of future reactors.

Simulation has the potential to address the critical needs of advanced nuclear energy systems by providing the tools necessary for safety assessments, design activities, and cost and risk reduction. One can, for example, imagine virtual prototyping of reactor cores yielding data that lead to more accurate identification of design margins, allow early experimentation with novel design concepts, and ultimately significantly reduce plant certification timelines. In other areas, such as advanced fuel fabrication, atomistic fuel simulations could ultimately make it possible to target a small subset of promising candidate fuel types for further experimentation, greatly reducing the number of experiments to be performed. A simulation-based methodology is within reach with exascale computers. The scope of the M&S tools needed to support the design, analysis, and engineering of next-generation nuclear energy systems is daunting:

1. Integrated 3D reactor core simulations with rigorous propagation of uncertainty;
2. Coupled thermal-hydraulic and primary loop simulation;
3. Advanced fuel design and performance;
4. Fuel behavior engineering;
5. Advanced secondary loop and balance-of-plant engineering and analysis;
6. Advanced fuel cycle design;
7. Separations facility engineering optimization;
8. Repository design, including seismic, geological, chemical, and thermal modeling and simulation;
9. Overall nuclear energy systems model development suitable for alternative economic analysis.

Spent fuel reprocessing is very complicated: a large number of different materials and multiple pathways must be considered, waste streams must be treated, and improved coupling between computations and experiments must occur. Reprocessing occurs at high temperature and is in dire need of better multi-scale M&S. The opportunities for impact on reprocessing with exascale M&S abound. These include developing new separation agents, full-scale plant simulations using first principles, integrating multiple codes, and separations simulations. Empirical understanding does not lead to appropriate scale-up; that will instead require exascale computing. Some of the payoffs for exascale computation include reduced R&D cost and time, improved and accelerated design, process scale-up, reduced facility cost, opportunity for major change, and waste form design. Many challenges confront viable and useful (predictive) M&S of fuel performance. These include the ability to reduce fuel development and qualification time, assess life-cycle performance, address safety concerns, predict fuel rod behavior in design basis accidents (DBA), and predict current and advanced (e.g., transuranic) fuel behavior. Important effects and requirements to incorporate include material properties, swelling, microstructural phase change, thermal properties, crack formation, and mechanical property change. High-fidelity modeling of fuel performance is inherently multiscale; for example, the effects of point defects and fission products must be considered. Exascale platform requirement drivers in fuel performance can be quantified. Opportunities for exascale M&S of existing and future advanced reactors include eliminating unrealistic assumptions that drive toward more conservative designs and thus higher installation cost, helping to achieve higher power efficiencies, reducing the learning curves needed to reach those efficiencies, helping to reduce the required number of repositories, improving safety posture, optimizing the design of the power grid and the fuel cycle, and enabling better (more efficient) operations, including in-line monitoring and operator training. There are numerous issues confronting advanced reactor M&S today. The core is a coupled-physics problem (not currently handled very well), and the full system needs to be analyzed in one tool. Current reactor designs are excessively conservative.

10.4.3.7 Other Disciplines
Other frequently mentioned disciplines that will be greatly impacted by exascale computing are:

Astrophysics Fusion Energy National Security

Users should consult the USDOE report "The Opportunities and Challenges of Exascale Computing" for further information.

Challenges in Going to the Exascale
Creating an exascale computer capable of effectively running the applications just described will require significant R&D breakthroughs. The previous section laid out the case for the wide range of scientific and technical advances that could be made with an exaflop computer. This section discusses the challenges involved in making that three-order-of-magnitude jump in technology. In this type of discussion, it is far too easy to talk about the jump as a set of quantitative steps in an evolutionary process, when in fact it implies significant qualitative changes in the way solutions must be approached. Consider Table 10.1, which illustrates a three-order-of-magnitude change. The analogy to computing is not exact, because we do not have to explore totally different technologies to make the leap of three orders of magnitude. However, just as we would not ask a marathon runner to explore the solar system, we cannot use current technology to produce an exaflop system. Below we highlight the important steps necessary to take this giant step; users may consult the [Report on Exascale Computing]251 for additional information.

Technology         Quantitative Rate   Qualitative Change
Marathon Runner    10 mph              Explore a town
Car                100 mph             Explore a country
Jet                1,000 mph           Explore a world
Spacecraft         10,000 mph          Explore the solar system

Table 10.1  Three-Order-of-Magnitude Jump

10.4.4.1 The Hardware Challenges
The architectural challenges for reaching exascale are dominated by power, memory, interconnection networks and resilience. Table 10.2 compares current HPC designs with potential exascale designs from the DOE252. The baseline is a factor of 500 change in peak system performance. The differences in the factor changes for the various components show why simple scaling of systems (e.g., buying 500 two-petaflop systems) would be inadequate.

Table 10.2

Potential Exascale Computer Design for 2018 and its relationship to current HPC designs (DOE)

251 The Opportunities and Challenges of Exascale Computing, Summary Report of the Advanced Scientific Computing Advisory Committee (ASCAC) Subcommittee, Fall 2010, USDOE.
252 See previous.


Take, for example, the power line in the table. While the peak speed goes up by a factor of 500, the power cost cannot go up by more than a factor of 3. That means the power solution for an exaflop system has to be over 150 times more efficient than current technology, which is a huge challenge. Looking through the other entries, the table clearly echoes the sentiments of the IAA and highlights key features that must be addressed in hardware or downstream in software. Other potential challenges are:

• Exaflop hardware needs major R&D progress
• Power
• System memory
• Data movement
• System resiliency

10.4.4.2 The Applied Mathematics Challenges
The applied mathematics component of an exascale program should include attention to activities with time horizons ranging from medium term to very long term, where both ends of the time scale are essential. The description "medium-term" is deliberate because experience in adapting to new computational modalities shows that short-term, one-off strategies are likely to be wasteful. Even though much remains unknown about the details of exascale systems, a clear medium-term priority is the definition and implementation of algorithms that are scalable at very large levels of parallelism (such as on million-core machines) and that remain sufficiently fast under different hardware decisions about bandwidth and latency. Scalability should be modeled and analyzed mathematically, using abstractions that represent key architectural ingredients. Simulations and experiments that indicate the effects of hardware and software perturbations on algorithmic efficiency can then guide the definition of methods that retain scalability under a variety of hardware scenarios. In this spirit, the strategies for applied mathematics in exascale science will require sustained support over time for people-intensive activities, early identification of the hardest (and least straightforward) research problems, and built-in flexibility to pursue unexpected and promising new directions as they arise. Some other important points are not discussed here.

10.4.4.3 Mathematical Modeling
It is natural for those developing mathematical models of practical problems to limit themselves to formulations that can be solved numerically using currently available methods. Although essential when the problem needs to be solved in the short term, an ab initio focus on feasibility can create a too-rigid environment in which non-standard or "blue-sky" formulations are either never thought about or else summarily rejected. For example, a problem formulation that represents many real-world problems yet tends to be avoided because of its known intractability is constrained nonlinear optimization with a mixture of continuous, discrete, and categorical variables. But the prospect of massive increases in computational power means that modeling ideas previously dismissed as impossible or impractical may well become realistic, and should be carefully examined and analyzed. Creative rethinking of mathematical models is an essential strategy to address the challenges of exascale science. The highly desired "transformational" changes flowing from exascale computing are most likely to come from new formulations that change the way we think about problems, rather than from applying more resources to an existing formulation to obtain a more accurate solution or to solve a larger problem. Mathematical models are inherently an approximation of reality, and an exascale initiative provides an opportunity to loosen the grip of, or even remove, computationally imposed simplifications. The major challenge is to devise models that capture the important details of physical and engineered systems as they really are. This will almost certainly generate much harder sub-problems and/or much more data, but the gains are likely to be eminently worthwhile.


10.4.4.4 Numerical Algorithms
The need for scalable algorithms in an exascale initiative has already been stressed. Another essential feature, highlighted in a 2009 talk by Kathy Yelick called Ten Ways to Waste a Parallel Computer, is a "back to basics" approach to reformulation. Without careful analysis of both new models and new numerical methods, there is the risk of significant inaccuracy or large computational overhead in unexpected parts of the overall solution process, as illustrated in the following two examples related to numerical methods for partial differential equations:

1. All indications are that memory will become the rate-limiting factor along the path to exascale, and investments should accordingly be made in designing algorithms with reduced memory requirements. Examples where this work is appropriate include:
   i. algorithmically scalable matrix-free methods (e.g., multigrid) for sparse systems of equations, where "algorithmically scalable" means that the total resources needed to solve the problem (flops plus memory) are proportional to the resources needed to evaluate the associated operator (a brief code sketch of this idea appears at the end of this subsection);
   ii. high-order methods that perform more computation to obtain greater accuracy for each computational degree of freedom;
   iii. adaptive models/methods designed to use the smallest possible number of degrees of freedom to obtain the needed level of accuracy.

2. Many calculations related to DOE missions involve models that depend on both space and time. In current methods, obtaining better spatial resolution typically requires a comparable reduction in the time step. A frequent argument for exascale science is that it will allow much finer spatial resolution in numerous application domains, with (for example) meshes reduced in size by a factor of ten. Unfortunately, simply reducing mesh spacing by a factor of ten could lead to a ten-fold increase in the time for solution, even with perfect weak scaling. Several strategies, all in the spirit of rethinking, should be explored to avoid this inefficiency. For example, models can be made more implicit to avoid restrictive time-step conditions arising from stiff processes that rapidly relax to equilibrium (e.g., in the context of low Mach-number fluid flows). A further strategy is aggressive use of sub-cycling in time for processes that are fast, but either are localized in physical space or involve only a small subset of the variables in state space. A motivating example here is advection in the jet stream in atmospheric modeling. Approaches of this flavor across the spectrum of numerical methods will almost certainly lead to increased algorithmic complexity, in addition to the daunting software-related challenges discussed. The substantially greater work needed to devise exascale numerical methods and software leads us to observe that, for decades, there has been, roughly speaking, a dichotomy in the wish list for the mathematical software used to solve scientific and engineering problems. On one hand, many DOE scientists have neither time nor inclination to become experts in numerical methods and software techniques, preferring to leave software development to mathematicians and computer scientists. On the other hand, some scientists and engineers want to become deeply involved in producing domain-specific methods and software to attain the highest possible efficiency for their particular problem. An exascale science program needs to address the needs of both these groups. For the first, “professional” mathematical software and libraries (meaning software developed by mathematicians and computer scientists for relatively generic problems such as solving linear systems or eigenvalue problems) should be developed for increasingly broad problem categories as we move toward exascale. In this way, domain scientists will be able to use state-of-the-art software components that can be shared across multiple application domains. Since writing software is universally recognized


to be time consuming and error-prone, scientists and engineers will benefit from availability of software that they can use off the shelf while experimenting with domain-specific challenges rather than writing their own sparse matrix package. For the second group, specific scientific case studies should be identified that require significant involvement of domain scientists, mathematicians, and computer scientists in end-to-end software development.
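To make the "algorithmically scalable, matrix-free" idea of item (i) above concrete, the sketch below applies a 1D Poisson operator through a stencil function rather than a stored matrix, so both memory and work scale with one operator evaluation; the Jacobi sweep is only one simple choice of iteration, and the whole example is an illustrative sketch of the concept, not code taken from the report.

```python
import numpy as np

def apply_poisson_1d(u, h):
    """Matrix-free application of the 1D Poisson operator (-u'' with zero Dirichlet BCs).

    The matrix is never formed: only the stencil (-1, 2, -1)/h^2 is applied,
    so storage and flops are both proportional to one operator evaluation.
    """
    Au = np.empty_like(u)
    Au[0] = (2.0 * u[0] - u[1]) / h**2
    Au[-1] = (2.0 * u[-1] - u[-2]) / h**2
    Au[1:-1] = (-u[:-2] + 2.0 * u[1:-1] - u[2:]) / h**2
    return Au

def jacobi_matrix_free(f, h, sweeps):
    """Plain Jacobi iteration for A u = f using only operator applications."""
    u = np.zeros_like(f)
    diag = 2.0 / h**2                    # diagonal of the implicit matrix
    for _ in range(sweeps):
        r = f - apply_poisson_1d(u, h)   # residual from a matrix-free product
        u += r / diag                    # Jacobi update
    return u

n = 31
h = 1.0 / (n + 1)
x = np.linspace(h, 1.0 - h, n)
f = np.pi**2 * np.sin(np.pi * x)         # manufactured right-hand side
u = jacobi_matrix_free(f, h, sweeps=2000)
print("max error vs sin(pi x):", np.max(np.abs(u - np.sin(np.pi * x))))
```

In practice such a stencil application would serve as the smoother inside a multigrid cycle; the point here is only that no matrix storage beyond the solution and residual vectors is needed.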

Other relevant topics include:

• Mathematics for massive data
• Machine learning
• Compressive sampling
• Symbolic computing

10.4.4.5 The Algorithmic Challenges
Advancing science in key areas requires development of next-generation physical models to satisfy the accuracy and fidelity needs for targeted simulations. The impact of these simulation fidelity needs on the requirements for computational science is twofold:

• First, more complex physical models must be developed to account for more aspects of the physical phenomena being modeled.
• Second, for the physical models being used, increases in resolution for key system variables, such as numbers of spatial zones, time steps or chemical species, are needed to improve simulation accuracy, which in turn places higher demands on computational hardware and software.

Application models represent the functional requirements that drive the need for certain numerical algorithms and software implementations. Science priorities lead to science models, and models are implemented in the form of algorithms. Algorithm selection is based on various criteria, such as appropriateness, accuracy, verification, convergence, performance, parallelism and scalability. Moving forward to exascale will put heavier demands on algorithms in at least two areas:

• the need for increasing amounts of data locality in order to perform computations efficiently;
• the need to obtain much higher factors of fine-grained parallelism as high-end systems support increasing numbers of compute threads.

As a consequence, parallel algorithms must adapt to this environment, and new algorithms and implementations must be developed to extract the computational capabilities of the new hardware. Significant new model development, algorithm re-design and science application code re-implementation, supported by exascale-appropriate programming models, will be required to exploit efficiently the power of exascale architectures. The transition from current sub-petascale and petascale computing to exascale computing will be at least as disruptive as the transition from vector to parallel computing in the 1990s. Uncertainty quantification will permeate the exascale science workload. The demand for predictive science results will drive the development of improved approaches for establishing levels of confidence in computational predictions. Both statistical techniques involving large ensemble calculations and other statistical analysis tools will have significantly different dynamic resource allocation requirements than in the past, and the significant code redesign required for the exascale will present an opportunity to embed uncertainty quantification techniques in exascale science applications. Some other points are:

• New multicore-friendly and multicore-aware algorithms
• Adaptive response to load imbalance
• Multiple precision algorithms/software
• Communication avoiding
• Fast implicit solvers
• Auto-tuning
• Scheduling and memory management for heterogeneity and scale
• Fault tolerance and robustness for large-scale systems
• Building energy efficiency into algorithm foundations
• Sensitivity analysis
• Multiscale/multi-physics modeling

10.4.4.6 Computer Science Challenges
The coming transition in computer architectures as peak capability approaches the exascale offers both challenges and opportunities253. The challenges involve a paradigm shift in programming methodologies. Existing technologies for writing parallel scientific applications have sustained HPC application software development for the past decade and have been successful for petascale computing, but were architected for coarse-grained concurrency largely dominated by bulk-synchronous algorithms. Future hardware constraints and growth in explicit on-chip parallelism will likely require a mass migration to new algorithms and software architectures that is as broad and disruptive as the migration from vector to parallel computing systems that occurred 15 years ago. The applications and algorithms will need to rely increasingly on fine-grained parallelism, strong scaling, and fault resilience. Addressing these challenges opens up a renewed opportunity to introduce a higher level of software engineering into current fusion application subsystems that will enhance the modularity, portability, and performance of codes while extending their capabilities to new levels. At the same time, past sound investments must be protected, and a migration path from current to future environments must be elaborated. Some other themes are:

• Programming models
• I/O
• Getting there from here
• Tools
• Fault tolerance

10.4.4.7 Educational Challenges
Major challenges in exascale science include the building of understanding and awareness among groups with high prestige in both academia and industry, and the dearth of highly competent young scientists in this field, two issues that are not entirely unrelated. Many of the reasons for these problems are reasonably well understood, but not easily dealt with. Application scientists who focus primarily on building computational tools are sometimes regarded by their scientific community as not being "real" scientists. This phenomenon is particularly noticeable in both physics and chemistry, reflecting in part the penetration of "community codes". From the opposite perspective, high-level software designers and programmers may not welcome or appreciate the contributions made by scientific disciplines to building state-of-the-art computational tools. On the bright side, interest in computational science and engineering worldwide has measurably increased during the past 15 years. Almost no universities, even those with faculty working on computational science and engineering, have, or are likely to develop, a curriculum that focuses on issues associated with exascale science. In addition, as our subcommittee has noted already, many of the issues in exascale science are not yet understood, which means that a straightforward program of training in the usual sense is impossible. Exascale hardware and its features will keep changing, so that training people too early to think about specific hardware configurations is a bad idea. However, it is important to start soon to lay the foundations for future thinking about exascale science. To be successful, an exascale science education and training program needs to be devised and managed with creative flair, not business as usual254.

253 The Opportunities and Challenges of Exascale Computing, Summary Report of the Advanced Scientific Computing Advisory Committee (ASCAC) Subcommittee, Fall 2010, USDOE.

254 The Opportunities and Challenges of Exascale Computing, Summary Report of the Advanced Scientific Computing Advisory Committee (ASCAC) Subcommittee, Fall 2010, USDOE.


11 Artificial Intelligence in CFD
Artificial Intelligence (AI) is the broadest way to think about advanced computer intelligence. In 1956, at the Dartmouth Artificial Intelligence Conference, the technology was described as follows: "Every aspect of learning or any other feature of intelligence can in principle be so precisely described that a machine can be made to simulate it." AI can refer to anything from a computer program playing a game of chess to a voice-recognition system like Amazon's Alexa interpreting and responding to speech. IBM's Deep Blue, which beat chess grandmaster Garry Kasparov in 1997, and Google DeepMind's AlphaGo are examples of AI. According to the HackerEarth Blog, AI can be classified into the following (see Figure 11.1):

• Machine Learning
• Neural Networks
• Deep Learning

Figure 11.1

Scope of Artificial Intelligence (Courtesy of Hackerearth Blog)

Machine Learning
Before discussing machine learning in CFD, it is best to review briefly what machine learning is. Machine learning is a type of Artificial Intelligence (AI) that provides computers with the ability to learn without being explicitly programmed. Machine learning focuses on the development of computer programs that can change when exposed to new data. The process of machine learning is similar to that of data mining; both systems search through data to look for patterns. However, instead of extracting data for human comprehension, as is the case in data mining applications, machine learning uses the data to detect patterns and adjust program actions accordingly. Machine learning algorithms are often categorized as being supervised or unsupervised. Supervised algorithms can apply what has been learned in the past to new data. Unsupervised algorithms can draw inferences from datasets. Facebook's News Feed uses machine learning to personalize each member's feed. If a member frequently stops scrolling in order to read or "like" a particular friend's posts, the News Feed will start to show more of that friend's activity earlier in the feed. Behind the scenes, the software uses statistical analysis and predictive analytics to identify patterns in the user's data and uses those patterns to populate the News Feed; new interactions are included in the data set, and the News Feed adjusts accordingly. Google and Amazon are other heavy users of machine learning.

Difference Between Artificial Intelligence and Machine Learning
Artificial Intelligence (AI) is a computer program that does something smart. It can be a pile of statements or a complex statistical model. Usually, when a computer program designed by AI researchers actually succeeds at something, like winning at chess, many people say it is "not really intelligent", because the algorithm's internals are well understood. So you could say that true AI is whatever computers can't do yet. Machine learning is a subset of AI. In short, machine learning is a science that involves the development of self-learning algorithms. These algorithms are generic enough that they can be applied to problems in various domains. Machine learning uses statistics (mostly inferential statistics) to develop self-learning algorithms. Artificial intelligence is a science aimed at developing a system or software that mimics a human in how it responds and behaves in a given circumstance. As a field with extremely broad scope, AI has divided its goals into multiple chunks, and each chunk has since become a separate field of study. The "learning" part of machine learning means that ML algorithms attempt to optimize along a certain dimension; i.e., they usually try to minimize error or maximize the likelihood of their predictions being true. How does one minimize error? One way is to build a framework that multiplies inputs in order to make guesses as to the inputs' nature. Different outputs/guesses are the product of the inputs and the algorithm. Usually, the initial guesses are quite wrong, and if you are lucky enough to have ground-truth labels pertaining to the input, you can measure how wrong your guesses are by contrasting them with the truth, and then use that error to modify your algorithm. That is what Artificial Neural Networks (ANNs) do. They keep on measuring the error and modifying their parameters until they can't achieve any less error. They are, in short, an optimization algorithm. If you tune them right, they minimize their error by guessing and guessing and guessing again.
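The "guess, measure the error, adjust" loop described above can be made concrete with a few lines of gradient descent on a single linear neuron. This is an illustrative sketch only; the data, the target coefficients and the learning rate below are invented for the example and do not come from the text.

```python
import numpy as np

# Toy data: inputs x and ground-truth labels y generated from y = 3x + 1 plus noise.
rng = np.random.default_rng(0)
x = rng.uniform(-1.0, 1.0, size=100)
y = 3.0 * x + 1.0 + 0.1 * rng.normal(size=100)

w, b = 0.0, 0.0                  # initial (wrong) guesses for the parameters
lr = 0.1                         # learning rate

for epoch in range(200):
    y_hat = w * x + b            # multiply inputs by weights to make a guess
    err = y_hat - y              # compare the guess with the ground truth
    grad_w = 2.0 * np.mean(err * x)   # gradients of the mean squared error
    grad_b = 2.0 * np.mean(err)
    w -= lr * grad_w             # use the error to modify the parameters
    b -= lr * grad_b

print(f"learned w={w:.2f}, b={b:.2f} (target was 3 and 1)")
```

A neural network does essentially the same thing, only with many more parameters and with the error propagated backwards through several layers.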

Deep Learning

Artificial Neural Networks (ANNs) are inspired by our understanding of the biology of our brains: all those interconnections between the neurons255. But, unlike a biological brain, where any neuron can connect to any other neuron within a certain physical distance, these artificial neural networks have discrete layers, connections, and directions of data propagation. You might, for example, take an image and chop it up into a bunch of tiles that are input to the first layer of the neural network. Individual neurons in the first layer then pass the data to a second layer. The second layer of neurons does its task, and so on, until the final layer is reached and the final output is produced. Each neuron assigns a weighting to its input: how correct or incorrect it is relative to the task being performed. The final output is then determined by the total of those weightings. Think of a stop-sign example. Attributes of a stop-sign image are chopped up and "examined" by the neurons: its octagonal shape, its fire-engine-red color, its distinctive letters, its traffic-sign size, and its motion or lack thereof. The neural network's task is to conclude whether this is a stop sign or not. It comes up with a "probability vector", really a highly educated guess, based on the weighting. In our example the system might be 86% confident the image is a stop sign, 7% confident it is a speed-limit sign, 5% confident it is a kite stuck in a tree, and so on; the network architecture then tells the neural network whether it is right or not.

Figure 11.2  Schematics of AI, Machine Learning and Deep Learning

255 Michael Copeland, "What's the Difference Between Artificial Intelligence, Machine Learning, and Deep Learning?", July 2010.


In short, Deep Learning is a technique for implementing Machine Learning. Deep Learning has enabled many practical applications of Machine Learning and, by extension, of the overall field of AI, as perceived in Figure 11.2. Deep Learning breaks down tasks in ways that make all kinds of machine assists seem possible, even likely. Driverless cars, better preventive healthcare, even better movie recommendations are all here today or on the horizon. Today, image recognition by machines trained via deep learning is, in some scenarios, better than that of humans, ranging from identifying cats to identifying indicators of cancer in blood and tumors in MRI scans. Google's AlphaGo learned the game and, in training for its Go match, tuned its neural network by playing against itself over and over.
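A hedged numerical sketch of the "weighting plus probability vector" idea in the stop-sign example above: the feature activations, class list and weights below are invented purely for illustration, and a softmax is used as one common way to turn raw scores into a probability vector.

```python
import numpy as np

def softmax(z):
    """Turn raw class scores into a probability vector that sums to one."""
    e = np.exp(z - np.max(z))    # subtract the max for numerical stability
    return e / e.sum()

# Invented feature activations for one image (shape, color, letters, size, motion).
features = np.array([0.9, 0.8, 0.7, 0.6, 0.1])

# Invented weights: one row per class (stop sign, speed-limit sign, kite).
weights = np.array([
    [2.0, 1.5, 1.8, 1.0, 0.2],   # stop sign responds to all five attributes
    [0.5, 0.2, 1.0, 1.0, 0.1],   # speed-limit sign shares letters and size
    [0.1, 0.8, 0.0, 0.3, 1.5],   # kite mostly responds to color and motion
])

scores = weights @ features       # each output neuron totals its weighted inputs
probs = softmax(scores)           # the "probability vector" over the classes
for name, p in zip(["stop sign", "speed limit", "kite"], probs):
    print(f"{name}: {p:.0%}")
```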

Types of Problems and Tasks

Machine learning tasks are typically classified into three broad categories, depending on the nature of the learning "signal" or "feedback" available to a learning system. These are:

Supervised Learning
How it works: this algorithm consists of a target/outcome variable (or dependent variable) which is to be predicted from a given set of predictors (independent variables). Using this set of variables, we generate a function that maps inputs to desired outputs. The training process continues until the model achieves a desired level of accuracy on the training data. Examples of supervised learning: regression, decision tree, random forest, KNN, logistic regression, etc.256

Unsupervised Learning
In this algorithm, we do not have any target or outcome variable to predict or estimate. It is used for clustering a population into different groups, which is widely applied for segmenting customers into groups for specific interventions. Examples of unsupervised learning: Apriori algorithm, K-means.

Reinforcement Learning
Using this algorithm, the machine is trained to make specific decisions. It works this way: the machine is exposed to an environment where it trains itself continually using trial and error. The machine learns from past experience and tries to capture the best possible knowledge to make accurate business decisions. Example of reinforcement learning: Markov Decision Process257.

List of Common Machine Learning Algorithms
Here is a list of commonly used machine learning algorithms. These algorithms can be applied to almost any data problem; the first four are explained in a little more detail below.

• Linear Regression
• Logistic Regression
• Decision Tree
• Artificial Neural Networks (ANNs)
• Support Vector Machine (SVM)
• Naive Bayes
• K-Nearest Neighbors (KNN)
• K-Means
• Random Forest
• Dimensionality Reduction Algorithms
• Gradient Boost

256 Sunil, Ray, "Essentials of Machine Learning Algorithms (with Python and R codes)", August 2015.
257 Same as previous.


11.3.4.1 Linear Regression
It is used to estimate real values (cost of houses, number of calls, total sales, etc.) based on continuous variable(s). Here, we establish a relationship between independent and dependent variables by fitting a best line. This best-fit line is known as the regression line and is represented by the linear equation Y = a*X + b. The best way to understand linear regression is to relive this experience of childhood. Let us say you ask a child in fifth grade to arrange the people in his class in increasing order of weight, without asking them their weights. What do you think the child will do? He or she would likely look at (visually analyze) the height and build of the people and arrange them using a combination of these visible parameters. This is linear regression in real life. The child has actually figured out that height and build are correlated to weight by a relationship which looks like the equation above. In this equation, Y is the dependent variable, a the slope, X the independent variable and b the intercept. The coefficients a and b are derived by minimizing the sum of squared distances between the data points and the regression line. In the example of Figure 11.3, the best-fit line has the linear equation y = 0.2811x + 13.9. Now, using this equation, we can find the weight, knowing the height of a person. Linear regression is of mainly two types: simple linear regression, characterized by one independent variable, and multiple linear regression, characterized (as the name suggests) by more than one independent variable. While finding the best-fit line, you can also fit a polynomial or curvilinear regression258.

Figure 11.3  Linear Regression

11.3.4.2 Logistic Regression
Don't get confused by its name! It is a classification, not a regression, algorithm. It is used to estimate discrete values (binary values like 0/1, yes/no, true/false) based on a given set of independent variable(s). In simple words, it predicts the probability of occurrence of an event by fitting data to a logit function; hence, it is also known as logit regression. Since it predicts a probability, its output values lie between 0 and 1 (as expected). Again, let us try to understand this through a simple example. Say your friend gives you a puzzle to solve. There are only two outcome scenarios: either you solve it or you don't. Now imagine that you are given a wide range of puzzles and quizzes in an attempt to understand which subjects you are good at. The outcome of this study would be something like this: if you are given a trigonometry-based tenth-grade problem, you are 70% likely to solve it; on the other hand, if it is a fifth-grade history question, the probability of getting an answer is only 30%. This is what logistic regression provides. Coming to the math, the log odds of the outcome are modeled as a linear combination of the predictor variables:

\mathrm{odds} = \frac{p}{1-p} = \frac{\text{probability of event occurrence}}{\text{probability of no event occurrence}},
\qquad
\mathrm{logit}(p) = \ln(\mathrm{odds}) = \ln\!\left(\frac{p}{1-p}\right) = b_0 + b_1 X_1 + b_2 X_2 + \dots

258 Sunil, Ray, "Essentials of Machine Learning Algorithms (with Python and R codes)", August 2015.


Above, p is the probability of presence of the characteristic of interest. The method chooses the parameters that maximize the likelihood of observing the sample values, rather than those that minimize the sum of squared errors (as in ordinary regression). Now, you may ask, why take a log? For the sake of simplicity, let's just say that this is one of the best mathematical ways to replicate a step function. We could go into more detail, but that would defeat the purpose of this overview.

11.3.4.3 Decision Tree
This is a favorite algorithm, used quite frequently. It is a type of supervised learning algorithm that is mostly used for classification problems259. Surprisingly, it works for both categorical and continuous dependent variables. In this algorithm, we split the population into two or more homogeneous sets. This is done based on the most significant attributes/independent variables, so as to make the groups as distinct as possible. In Figure 11.4, the population is classified into four different groups based on multiple attributes to identify whether "they will play or not". To split the population into different heterogeneous groups, it uses various techniques (see Figure 11.4).

Figure 11.4  Decision Tree

11.3.4.4 Artificial Neural Networks (ANNs)
An ANN is a computational model used in machine learning, computer science and other research disciplines, which is based on a large collection of connected simple units called artificial neurons, loosely analogous to neurons in a biological brain. Connections between neurons carry an activation signal of varying strength. If the combined incoming signals are strong enough, the neuron becomes activated and the signal travels to other neurons connected to it. Such systems can be trained from examples, rather than explicitly programmed, and excel in areas where the solution or feature detection is difficult to express in a traditional computer program. Like other machine learning methods, neural networks have been used to solve a wide variety of tasks, like computer vision and speech recognition, that are difficult to solve using ordinary rule-based programming. Typically, neurons are connected in layers, and signals travel from the first (input) to the last (output) layer. Modern neural network projects typically have a few thousand to a few million neural units and millions of connections; their computing power is similar to that of a worm brain, several orders of magnitude simpler than a human brain. The signals and states of artificial neurons are real numbers, typically between 0 and 1. There may be a threshold or limiting function on each connection and on the unit itself, such that the signal must surpass the limit before propagating. Back-propagation is the use of forward stimulation to modify connection weights, and is sometimes done to train the network using known correct outputs. However, the success is unpredictable: after training, some systems are good

259 Sunil, Ray, "Essentials of Machine Learning Algorithms (with Python and R codes)", August 2015.


at solving problems while others are not. Training typically requires several thousand cycles of interaction (see Figure 11.5). The goal of the neural network is to solve problems in the same way that a human would, although several neural network categories are more abstract. New brain research often stimulates new patterns in neural networks. One new approach is the use of connections which span further, connecting processing layers rather than adjacent neurons. Other research explores the different types of signal that axons propagate over time; deep learning, for instance, captures greater complexity than a set of Boolean variables being simply on or off. Newer types of network are more free-flowing in terms of stimulation and inhibition, with connections interacting in more chaotic and complex ways. Dynamic neural networks are the most advanced, in that they can dynamically, based on rules, form new connections and even new neural units while disabling others.

Figure 11.5  Artificial Neural Network (ANN)

Historically, the use of neural network models marked a directional shift in the late 1980s from high-level (symbolic) artificial intelligence, characterized by expert systems with knowledge embodied in if-then rules, to low-level (sub-symbolic) machine learning, characterized by knowledge embodied in the parameters of a cognitive model with some dynamical system. A simple example is provided below for a better explanation.

11.3.4.5 Case Study - Prediction of the Maximal Wall Shear Stress (MWSS) Value for Carotid Artery Bifurcation
Steady-state simulations for 1886 geometries were undertaken and MWSS values were calculated for each of them. This dataset was used for training and testing the following data mining algorithms: k-nearest neighbors, linear regression, neural network (multilayer perceptron), random forest and support vector machine. The results are based on the relative root mean square error (RMSE):

\mathrm{RMSE} = \sqrt{\frac{\sum_{i=1}^{n}\left(f_i-\hat{f}_i\right)^{2}}{\sum_{i=1}^{n}\left(f_i-\bar{f}\right)^{2}}},
\qquad
\begin{array}{l}
f_i = \text{desired value (target)}\\
\hat{f}_i = \text{predicted value (predicted using the data mining algorithm)}\\
\bar{f} = \text{average value of MWSS over all 1886 samples}\\
0 \le \mathrm{RMSE} \le 1
\end{array}

Eq. 11.1

Model                      RMSE
K-Nearest Neighbors        0.760
Linear Regression          0.748
Neural Network             0.140
Random Forest              1.127
Support Vector Machine     0.612

Table 11.1  Results of Different Methods


Figure 11.6 visualizes the global importance of the features used for modeling MWSS: the horizontal axis of each diagram denotes the values of a particular feature and the vertical axis denotes the respective average contribution for that feature value. The application of the model-explanation methodology quantitatively describes how much the features and their individual values, on average, influence the target predictions of the model (see Table 11.1 and Figure 11.6).

Figure 11.6 Maximal Wall Shear Stress (MWSS) Value for Carotid Artery Bifurcation
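A minimal sketch of how the relative RMSE of Eq. 11.1 could be evaluated for any of the models in Table 11.1; the arrays below are placeholders, not the actual 1886-sample MWSS data set.

```python
import numpy as np

def relative_rmse(target, predicted):
    """Relative RMSE of Eq. 11.1: the prediction error is normalized by the
    spread of the targets about their mean, so a perfect model gives 0 and a
    model no better than the mean gives roughly 1."""
    target = np.asarray(target, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    num = np.sum((target - predicted) ** 2)
    den = np.sum((target - target.mean()) ** 2)
    return np.sqrt(num / den)

# Placeholder values standing in for MWSS targets and model predictions.
f_true = np.array([2.1, 3.4, 2.8, 4.0, 3.1])
f_pred = np.array([2.0, 3.6, 2.9, 3.7, 3.2])
print("relative RMSE:", relative_rmse(f_true, f_pred))
```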

Machine Learning in Fluid Dynamics
Time-varying fluid flows are ubiquitous in modern engineering and in the life sciences. Particularly challenging is the characterization of unsteady aerodynamic forces and moments, as they play critical roles in, for instance, biological propulsion and bio-inspired engineering design principles. It is observed that birds, bats, insects, and fish routinely harness unsteady fluid phenomena to improve their propulsive efficiency, maximize thrust and lift, and increase maneuverability. Such observations are highly suggestive, leading to conjectures about the existence of low-dimensional structures in fluid flows. Machine learning aims to capitalize on such dominant patterns of spatial-temporal activity. When integrated with more traditional first-principles simulation, reduced-order models can be developed to accurately quantify fluid flows.

Evolution, Motivation and Objectives
Flow control has been a fundamental concept in fluid mechanics research in this century. We develop flow modeling and optimization techniques using biologically inspired algorithms such as Artificial Neural Networks (ANNs) and evolution strategies. The applications presented herein encompass a variety of problems such as cylinder drag minimization, neural net modeling of the near-wall structures, enhanced jet mixing, and parameter optimization in turbine blade film cooling. The unifying concept is the utilization of automated processes for the solution of these problems, devised from machine learning algorithms.


Low-order modeling of the near-wall dynamics in turbulent flows is a challenging problem that, when solved, could lead to drastically improved designs. We envision neural network approaches as an effective way of developing such models and incorporating them in feedback control algorithms. We present some preliminary results from the application of ANNs as a method to construct low-order models describing the near-wall dynamics in turbulent flows. Neural networks are viewed as a general procedure of model formation, encompassing schemes such as the Proper Orthogonal Decomposition (POD).

Design and Optimization Issues
Another key issue in the effort to reduce time to market of new engineering designs is the optimization of the design parameters in an efficient manner. The design cycle usually involves multi-objective and multi-disciplinary optimization problems, requiring the iterative solution of empirical formulas, the appropriate integration of numerical simulations, and the incorporation of physical understanding of the various aspects of the problem. At the same time, the optimization cycle of the physical problem must take into consideration financial and manufacturing constraints. In flow-related problems, this optimization cycle has benefited from advances in optimization theory, which usually aim at tackling the most costly aspects of the optimization problem such as the solution of the Navier-Stokes equations. Powerful techniques such as the adjoint procedure have been implemented successfully in the design cycle of aircraft. However, such optimization strategies are usually based on the efficient calculation of gradients of functions relating the quantity to be optimized to the parameters of the problem. Such gradients are not always readily available, as the optimization cycle often involves empirical formulas and cost functions that are difficult to express analytically in terms of the optimization parameters. Moreover, gradient-based algorithms usually converge to local extrema; therefore, the result strongly depends on the initial selection of parameters. Evolution strategies [Rechenberg]260 are optimization techniques that avoid the problems associated with the use of gradients, as they require only the calculation of the cost function at each point in the parameter space. They operate based on natural principles of evolution such as mutation, recombination, and selection. These operations are adapted so that the algorithm automatically develops and attempts to optimize a model landscape relating the cost function to its parameters. Compared with gradient-based techniques, their convergence rate is usually much lower, thus requiring large numbers of iterations that could be unrealistic for some problems of engineering interest. On the other hand, they are highly parallel algorithms that efficiently exploit today's powerful parallel computer architectures, and they are more likely than gradient-based algorithms to identify a global optimum. This latter aspect makes them attractive in many engineering applications where the fitness landscape cannot be assumed unimodal.

Accomplishments
Data methods are certainly not new in the fluids community.
Computational fluid dynamics has capitalized on machine learning efforts with dimensionality-reduction techniques such as Proper Orthogonal Decomposition (POD) or Dynamic Mode Decomposition (DMD), which compute interpretable low-rank modes and subspaces that characterize spatial-temporal flow data261. POD and DMD are based on the singular value decomposition, which is ubiquitous in the dimensionality reduction of physical systems. When coupled with Galerkin projection, POD reduction forms the mathematical basis of reduced-order modelling, which provides an enabling strategy for computing high-dimensional discretizations of complex flows262. The success of dimensionality reduction in fluids is enabled by:

• significant performance gains in computational speed and memory;
• generation of physically interpretable spatial and/or spatial-temporal modes that dominate the physics.

260 Rechenberg, I., "Evolutionsstrategie: Optimierung technischer Systeme nach Prinzipien der biologischen Evolution", Fromann-Holzboog, Stuttgart, 1973.
261 Holmes, P., Lumley, J. & Berkooz, G., "Turbulence, Coherent Structures, Dynamical Systems and Symmetry", Cambridge University Press, 1998.

Thus computations are enabled and critical physical intuition is gained. Such success is tempered by two well-known failings of POD/DMD-based reductions:

• their inability to capture transient, intermittent and/or multi-scale phenomena without significant tuning;
• their inability to capture invariances due to translation, rotation and/or scaling.

ANNs are almost diametrically opposed in their pros and cons. Specifically, ANNs are well suited for extracting multi-scale features, as the ANN decomposition shares many similarities with wavelet decompositions, which are the computational workhorse of multi-resolution analysis. Moreover, translations, rotations and other invariances are known to be easily handled in the ANN architecture. These performance gains are tempered by the tremendous computational cost of building an ANN from a large training set and the inability of an ANN to produce easily interpretable physical modes and/or features.

Field Inversion and Machine Learning in Support of a Data-Driven Environment
A machine learning technique such as an Artificial Neural Network (ANN) can be used, in a data-driven context, to represent the model corrections obtained from field inversion. The calibration cases (offline data) involve a few configurations for which data (DNS or experimental) are available, such as the one shown in Figure 11.7. The prediction cases (machine learning with no data) have similar configurations but differ in (1) twist, (2) sweep angle, and (3) airfoil shape263. The challenge in predictive modeling, however, is to extract an optimal model form that is sufficiently accurate. Constructing such a model and demonstrating its predictive capabilities for a class of problems is the objective.

Figure 11.7  Calibration Cases for Offline Data

262 Benner, P., Gugercin, S. & Willcox, K., "A survey of projection-based model reduction methods for parametric dynamical systems", SIAM Rev. 57, 483-531, 2015.
263 Heng Xiao, "Physics-Informed Machine Learning for Predictive Turbulence Modeling: Status, Perspectives, and Case Studies", Machine Learning Technologies and Their Applications to Scientific and Engineering Domains Workshop, August 17, 2016.

11.4.4.1 Artificial Neural Networks (ANNs)
The functional relationship b(η) is sought, where η = [η1, η2, ..., ηM]^T are input features derived from mean-field variables that will be available during the predictive solution process. The functional relationship must be developed by considering the output of a number of inverse problems representative of the modeling deficiencies relevant to the predictive problem. Further, as explained below, elements of the feature vector η are chosen to be locally non-dimensional quantities. The standard NN algorithm operates by constructing linear combinations of inputs and transforming them through nonlinear activation functions264. The process is repeated once for each hidden layer (marked blue in Figure 11.8) in the network, until the output layer is reached. Figure 11.8 presents a sample network diagram for a feed-forward NN with three inputs, two hidden layers, and one output. For this sample network, the values of the hidden nodes z1,1 through z1,H1 would be constructed as

 3 1  z1,i  a   w i, jηi   i 1  1

Eq. 11.2

where a^1 and w^1_{i,j} are the activation function and weights associated with the first hidden layer, respectively. Similarly, the second layer of hidden nodes is constructed as

z_{2,j} = a^{2}\left(\sum_{i=1}^{H_1} w^{2}_{i,j}\, z_{1,i}\right)

Eq. 11.3

Finally, the output is

y = f(\eta) = a^{3}\left(\sum_{i=1}^{H_2} w^{3}_{i}\, z_{2,i}\right)

Eq. 11.4

Given training data, error back-propagation algorithms265 are used to find the weights w^n_{i,j}. Once the weights are found, computing the output depends only on the number of hidden nodes, and not on the volume of the training data. Hyper-parameters of the NN method include the number of hidden layers, the number of nodes in each hidden layer, and the forms of the activation functions. Typically, 3 layers and about 100 nodes were employed, with a sigmoid activation function.

Figure 11.8

Network Diagram for a feed-forward NN with three inputs and one output
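The sketch below evaluates a feed-forward network of the form of Eqs. 11.2-11.4 (three inputs, two hidden layers, one scalar output) with sigmoid activations. The hidden-layer widths and the weights are random placeholders, since the trained values from the cited work are not reproduced here, and the back-propagation training step is deliberately omitted.

```python
import numpy as np

def sigmoid(s):
    return 1.0 / (1.0 + np.exp(-s))

rng = np.random.default_rng(1)
H1, H2 = 5, 4                       # hidden-layer widths (placeholders)
W1 = rng.normal(size=(H1, 3))       # weights of the first hidden layer
W2 = rng.normal(size=(H2, H1))      # weights of the second hidden layer
W3 = rng.normal(size=(1, H2))       # weights of the output layer

def feed_forward(eta):
    """Evaluate y = f(eta) for a feature vector eta = [eta1, eta2, eta3]."""
    z1 = sigmoid(W1 @ eta)          # Eq. 11.2: first hidden layer
    z2 = sigmoid(W2 @ z1)           # Eq. 11.3: second hidden layer
    y = sigmoid(W3 @ z2)            # Eq. 11.4: scalar output
    return y[0]

print(feed_forward(np.array([0.2, -0.5, 1.0])))
```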

264 Anand Pratap Singh, Shivaji Medida, Karthik Duraisamy, "Machine Learning-augmented Predictive Modeling of Turbulent Separated Flows over Airfoils", Nov 2016.
265 Zhang, Z. J. and Duraisamy, K., "Machine Learning Methods for Data-Driven Turbulence Modeling", 22nd AIAA Computational Fluid Dynamics Conference, AIAA Aviation (AIAA 2015-2460), Dallas, TX, Jun 2015.

The POD as Linear Artificial Neural Network (LANN)


A model reduction can be accomplished by projecting the model equations, i.e. the Navier-Stokes equations, onto a properly selected lower-dimensional phase subspace. A reasonable selection criterion for the basis of this manifold is the maximization of the energy content of the projection266. This can be done by applying the Karhunen-Loeve decomposition to a data set that is representative of the dynamics of the system that we wish to approximate. This operation is called Proper Orthogonal Decomposition (POD)267. The linear POD is an approximation of the flow vector v by a finite expansion of orthonormal functions φn such that:

v \approx V + \sum_{i=1}^{n} a_i(t)\,\varphi_i(x)

Eq. 11.5

where V is the time-averaged flow and φn is the set of the first n eigenvectors of the covariance matrix C = E[(v_i − V)(v_j − V)]; when this representation for v is substituted into the Navier-Stokes equations, the original PDE model is transformed into an ODE model composed of n equations. The POD can be expressed as a multi-layer feed-forward neural network. Such a network is defined by the number of layers, the specification of the output function for the neurons in each layer, and the weight matrices for each layer. [Baldi and Hornik]268 have shown that training a linear neural network structure to perform an identity mapping on a set of vectors is equivalent to obtaining the POD of this set of vectors. A neural network performing the linear POD can be specified as a two-layer linear network:

x = W_1 v, \qquad \hat{v} = W_2 x

Eq. 11.6

where \hat{v} is the reconstructed field, v is the original flow field, having N components, x is the reduced-order representation of the field, having n components, and W1 and W2 are the network weight matrices, of sizes n x N and N x n, respectively. Non-linearity can be introduced by a simple extension to this basic network:

x = W_2 \tanh(W_1 v), \qquad \hat{v} = W_4 \tanh(W_3 x)

Eq. 11.7

This corresponds to a neural network model with four layers: the first, with an m x N weight matrix W1, nonlinear; the second, with an n x m weight matrix W2, linear; the third, also nonlinear, with an m x n weight matrix W3; and the last, linear, with an N x m weight matrix W4. However, the resulting system of ODEs is more involved than the one resulting from the application of the linear POD.

266 S. Müller, M. Milano and P. Koumoutsakos, "Application of machine learning algorithms to flow modeling and optimization", Center for Turbulence Research Annual Research Briefs, 1999.
267 Berkooz, G., Holmes, P. & Lumley, J. L., "The proper orthogonal decomposition in the analysis of turbulent flows", Ann. Rev. Fluid Mech. 25, 539-575, 1993.
268 Baldi, P. & Hornik, K., "Neural networks and principal component analysis: Learning from examples without local minima", Neural Networks 2, 53-58, 1989.
269 Chambers, D. H., Adrian, R. J., Moin, P. & Stewart, S., "Karhunen-Loeve expansion of Burgers model of turbulence", Phys. Fluids 31, 2573-2582, 1998.

11.4.5.1 POD and Nonlinear ANN
A simple comparison of POD and a nonlinear ANN is provided by the reconstruction of the velocity field in the stochastically forced Burgers equation, a classical 1D model for turbulent flow [Chambers]269. The linear POD was used to obtain a set of 256 linear eigenfunctions using 10000 snapshots extracted from a simulation. Using the first 7 eigenfunctions it is possible to reconstruct the original flow field while retaining 90 percent of the energy. A nonlinear neural network was trained on the same data set to perform the identity mapping: this network is composed of 256 inputs and 4 layers having, respectively, 64 nonlinear neurons, 7 linear neurons, 64 nonlinear neurons, and 256 linear neurons. For validation purposes, a data set of 1000 snapshots, not used in the training phase, was used. In Figure 11.9 the reconstruction performance of both approaches can be appreciated; the proposed nonlinear ANN (bottom) clearly outperforms the linear POD (top) for a velocity field in the Burgers equation.

Figure 11.9

Comparison of linear POD (top) and Neural Networks (bottom)
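As a hedged sketch of the two architectures compared above, the code below builds the linear two-layer mapping of Eq. 11.6 from an SVD of a snapshot matrix and then defines the four-layer nonlinear mapping of Eq. 11.7 with untrained placeholder weights. The snapshot data are synthetic stand-ins for the Burgers-equation fields, the sizes n = 7 and m = 64 simply mirror the text, and no training loop is shown.

```python
import numpy as np

rng = np.random.default_rng(2)
N, S, n, m = 256, 1000, 7, 64       # state size, snapshots, reduced and hidden sizes

# Synthetic snapshot matrix standing in for flow-field data (columns are snapshots).
V = rng.normal(size=(N, S))

# Linear POD (Eq. 11.6): W1 projects onto the leading modes, W2 reconstructs.
Phi, _, _ = np.linalg.svd(V, full_matrices=False)
W1 = Phi[:, :n].T                   # n x N
W2 = Phi[:, :n]                     # N x n
v = V[:, 0]
v_hat_linear = W2 @ (W1 @ v)        # reconstruction through the reduced space

# Nonlinear extension (Eq. 11.7) with untrained random weights, for structure only.
W1n = rng.normal(size=(m, N))
W2n = rng.normal(size=(n, m))
W3n = rng.normal(size=(m, n))
W4n = rng.normal(size=(N, m))
x = W2n @ np.tanh(W1n @ v)
v_hat_nonlinear = W4n @ np.tanh(W3n @ x)

print("linear POD reconstruction error:", np.linalg.norm(v - v_hat_linear))
```

In an actual application the nonlinear weights would be trained to perform the identity mapping on the snapshot set, which is what allows the four-layer network to outperform the linear POD for the same reduced dimension.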

Overview of ANNs in Turbulence Applications
Turbulent flows generally exhibit multi-scale (spatial and temporal) physics that are high dimensional, with rotational and translational intermittent structures also present. Such data provide an opportunity for ANNs to make an impact in the modelling and analysis of turbulent flow fields.

Figure 11.10  Skin Friction Coefficient for the Onera M6 wing (true vs. machine learning), matched to within 2%


[Ling, Kurzawski & Templeton]270 have proposed using ANNs for Reynolds-averaged Navier-Stokes (RANS) models, which are widely used because of their computational tractability in modelling the rich set of dynamics induced by turbulent flows. In this highlighted body of work, the specific aim is to use ANNs to build an improved representation of the Reynolds stress anisotropy tensor from high-fidelity simulation data. Remarkably, despite the widespread success of ANNs at providing high-quality predictions in complex problems, there have been only limited attempts to apply deep learning techniques to turbulence. Thus far, these attempts have been limited to a couple of hidden layers. Figure 11.10 shows the skin friction coefficient for the Onera M6 wing matched to within 2%271.

The Future of ANNs for Fluids Modelling
ANNs will almost certainly have a transformative impact on modelling high-dimensional complex systems such as turbulent flows. The successes with many complex data sets will compel researchers to utilize this rapidly emerging data analysis tool for improving predictive capabilities. ANNs represent a paradigm shift for the community. Whereas many innovations have often been inspired by expert-in-the-loop intuition and physically interpretable models, ANNs have challenged these traditional notions by building prediction engines that simply outperform competing methods without providing clear evidence of why they are doing so. To some extent, the application of ANNs to turbulent flows will bring awareness to the fluids community of the two cultures of statistics and data science. These two outlooks are centered around the concepts of machine learning and statistical learning. The former focuses on prediction (ANNs), while the latter is concerned with inference of interpretable models from data (POD/DMD reductions). Although both methodologies have achieved significant success across many areas of big data analytics, the physical and engineering sciences have primarily focused on interpretable methods. Despite its successes, significant challenges remain for ANNs. Simple questions remain:

1. How many layers are necessary for a given data set?
2. How many nodes at each layer are needed?
3. How big must my data set be to properly train the network?
4. What guarantees exist that the mathematical architecture can produce a good predictor of the data?
5. What is my uncertainty and/or statistical confidence in the ANN output?
6. Can I actually predict data well outside of my training data?
7. Can I guarantee that I am not overfitting my data with such a large network?

And the list goes on. These questions remain central to addressing the long-term viability of ANNs. The good news is that such topics are currently being intensely investigated by academic researchers and industry (Google, Facebook, etc.) alike. Undoubtedly, the next decade will witness significant progress in addressing these issues. From a practical standpoint, the number of layers and nodes is determined based upon prediction success, i.e., layers and nodes are added only while they improve performance. Additionally, cross-validation is imperative to suppress overfitting. As a general rule, one should never trust the results of an ANN unless rigorous cross-validation has been performed. Cross-validation plays the same critical role as a convergence study of a numerical scheme. Given the computational maturity of ANNs and how readily available they are (see Google's open source software called TensorFlow), it is perhaps time for part of the turbulence modelling community to adopt what has become an important and highly successful part of the machine learning culture: challenge data sets.

270 Ling, J., Kurzawski, A. & Templeton, J., "Reynolds averaged turbulence modelling using deep neural networks with embedded invariance", J. Fluid Mech. 807, 155-166, 2016.
271 Karthik Duraisamy, "A Framework for Turbulence Modeling using Big Data", NASA Aeronautics Research Mission Directorate (ARMD) LEARN/Seedling Technical Seminar, January 13-15, 2015.
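As a sketch of the cross-validation discipline recommended above, the snippet below scores a small regression network with k-fold cross-validation using scikit-learn. The random data are placeholders for whatever flow features and targets are actually available, scikit-learn is assumed to be installed, and the network size is arbitrary.

```python
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neural_network import MLPRegressor

# Placeholder data: 200 samples of 5 "flow features" and a scalar target.
rng = np.random.default_rng(3)
X = rng.normal(size=(200, 5))
y = np.sin(X[:, 0]) + 0.5 * X[:, 1] ** 2 + 0.1 * rng.normal(size=200)

model = MLPRegressor(hidden_layer_sizes=(20, 20), max_iter=2000, random_state=0)
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# Each fold is held out once; a large spread between folds would signal overfitting.
scores = cross_val_score(model, X, y, cv=cv, scoring="r2")
print("R^2 per fold:", np.round(scores, 3))
print("mean R^2:", scores.mean())
```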

Some Preliminary Concepts in Quantum Computation Quantum computers are incredibly powerful machines that take a new approach to processing information. Built on the principles of quantum mechanics, they exploit complex and fascinating laws of nature that are always there, but usually remain hidden from view. By harnessing such natural behavior, quantum computing can run new types of algorithms to process information more holistically. They may one day lead to revolutionary breakthroughs in materials and drug discovery, the optimization of complex manmade systems, and artificial intelligence. We expect them to open doors that we once thought would remain locked indefinitely. Acquaint yourself with the strange and exciting world of quantum computing. What is Quantum Computing? Nature, including molecules like caffeine, follows the laws of quantum mechanics, a branch of physics that explores how the physical world works at the most fundamental levels. At this level, particles behave in strange ways, taking on more than one state at the same time, and interacting with other particles that are very far away. Quantum computing harnesses these quantum phenomena to process information in a novel and promising way. The computers we use today are known as classical computers. They’ve been a driving force in the world for decades advancing everything from healthcare to how we shop. But there are certain problems that classical computers will simply never be able to solve. Consider the caffeine molecule in a cup of coffee. Surprisingly, it’s complex enough that no computer that exists or could be built would be capable of modeling caffeine and fully understanding its detailed structure and properties. This is the type of challenge quantum has the potential to tackle. How do Quantum Computers work and what they can do? Classical computers encode information in bits. Each bit can take the value of 1 or 0. These 1s and 0s act as on/off switches that ultimately drive computer functions. Quantum computers, on the other hand, are based on qubits, which operate according to two key principles of quantum physics: superposition and entanglement. Superposition means that each qubit can represent both a 1 and a 0 at the same time. Entanglement means that qubits in a superposition can be correlated with each other; that is, the state of one (whether it is a 1 or a 0) can depend on the state of another. Using these two principles, qubits can act as more sophisticated switches, enabling quantum computers to function in ways that allow them to solve difficult problems that are intractable using today’s computers. Quantum systems may untangle the complexity of molecular and chemical interactions leading to the discovery of new medicines and materials. They may enable ultra-efficient logistics and supply chains, such as optimizing fleet operations for deliveries during the holiday season. They may help us find new ways to model financial data and isolate key global risk factors to make better investments. And they may make facets of artificial intelligence such as machine learning much more powerful. Classical vs. Quantum Computing A classical computer takes information, usually as a list of ones and zeros, i.e., bits, (such as electrical signals which can be at two different voltage levels), and uses electronic circuitry to process this information. 
It performs a set of pre-determined calculations, which can be broken down into so-called "gate operations" in which the state of some bits is changed based on the known values of other bits. The computer then outputs the final result, again as a string of definite ones and zeros. A quantum computer instead takes information encoded in a quantum state, performs pre-determined "gate operations" according to the laws of quantum mechanics, and produces a new quantum state, which can be measured to determine the outcome of the computation.
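To make this description concrete, the following minimal C sketch (an illustration added here, not part of the original text) represents a two-qubit register as a vector of 2^N = 4 complex amplitudes, applies a Hadamard gate followed by a CNOT as the "gate operations", and prints the measurement probabilities of the resulting entangled (Bell) state.

/* Minimal two-qubit state-vector sketch (illustrative only): a Hadamard on
   qubit 0 followed by a CNOT yields the Bell state (|00> + |11>)/sqrt(2).
   The state of N qubits needs 2^N complex amplitudes. */
#include <stdio.h>
#include <complex.h>
#include <math.h>

#define NQUBITS 2
#define DIM (1 << NQUBITS)              /* 2^N amplitudes */

/* Apply a single-qubit gate g (2x2, row-major) to qubit q of state psi. */
static void apply_1q(double complex psi[DIM], const double complex g[4], int q)
{
    for (int i = 0; i < DIM; i++) {
        if (!(i & (1 << q))) {          /* visit each amplitude pair once */
            int j = i | (1 << q);
            double complex a = psi[i], b = psi[j];
            psi[i] = g[0] * a + g[1] * b;
            psi[j] = g[2] * a + g[3] * b;
        }
    }
}

/* CNOT with control c and target t: flip the target where the control is 1. */
static void apply_cnot(double complex psi[DIM], int c, int t)
{
    for (int i = 0; i < DIM; i++) {
        if ((i & (1 << c)) && !(i & (1 << t))) {
            int j = i | (1 << t);
            double complex tmp = psi[i];
            psi[i] = psi[j];
            psi[j] = tmp;
        }
    }
}

int main(void)
{
    double complex psi[DIM] = { 1.0 };  /* start in |00> */
    const double s = 1.0 / sqrt(2.0);
    const double complex H[4] = { s, s, s, -s };

    apply_1q(psi, H, 0);                /* superposition on qubit 0 */
    apply_cnot(psi, 0, 1);              /* entangle qubit 1 with qubit 0 */

    for (int i = 0; i < DIM; i++)       /* measurement probabilities */
        printf("P(|%d%d>) = %.3f\n", (i >> 1) & 1, i & 1,
               creal(psi[i] * conj(psi[i])));
    return 0;
}

The point of the sketch is that the state of N qubits requires 2^N complex amplitudes, which is also why simulating a quantum computer on a classical machine becomes intractable as N grows.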


Figure 11.11 Difference in Processing Between a Classical and a Quantum Computer

Qubits and the Power of a Quantum Computer
The key difference between the two lies in the ability of a quantum state to represent many possible "classical" states at the same time. Where a classical bit can be either a "0" or a "1", a quantum bit is instead any possible combination, or "superposition", of "0" and "1", with complex numbers as the coefficients of the superposition. That is, where a "bit" of information in a classical computer can be represented by two points ("0" or "1"), a "qubit" in a quantum computer is represented by any point on the surface of a 3D sphere, the "Bloch sphere" (Figure 11.12). Not only can each qubit be in such a superposition state, but the system as a whole can be in a superposition of every combination of states of all the qubits. This is why a quantum computer could be so immensely powerful: every possible state could be stored and processed in parallel with all the others. The number of possible states that can be present in the superposition is huge: with N qubits there are 2^N possible states. A quantum computer with just 30 qubits would have 1,073,741,824 possible states, and a quantum computer with 300 qubits would have roughly the same number of possible states as the total number of atoms in the known universe.

Figure 11.12 The Bloch Sphere is a Representation of a Qubit, the Fundamental Building Block of Quantum Computers

Quantum Algorithms: Programming a Quantum Computer
There are two difficulties with quantum computers: determining how to program such a system, and learning how to build one. Programming a quantum computer is made more difficult by the laws of measurement in quantum mechanics: when we measure the system we do not obtain every possible result; rather, the measurement collapses the state onto one usable outcome. Because of this, it is not easy to design algorithms that make use of the intrinsic power of a quantum computer. They are also difficult to write, because they are mathematically more complex and much less intuitive than algorithms for a classical computer. The best-known early algorithm for a quantum computer was presented by Peter Shor in 1994; it finds the factors of an n-digit number using a number of operations that grows only polynomially with n. This algorithm exploits the fact that quantum computers would be good at finding the period of a periodic function, which can be related by number theory to the problem of factorizing a number. To give a comparison with classical computers, if we assume that a fast supercomputer would take about a year to factor a 150-digit number, then the same computer would require roughly the lifetime of the universe to factor a 400-digit number (a rough sketch of the underlying complexity estimates is given at the end of this subsection). The acceptance that factoring scales like this is the basis of public-key encryption systems, which are used every day for secure transactions made over the internet. The security of the system relies on the fact that the "public key", which is a large number, cannot be factored to find the "private key", which is made up of the factors of that number. If a quantum computer could be built that would factor a 150-digit number in a month, then the same quantum computer could factor a 400-digit number in about a year. Of course, this might be bad news for current encryption schemes, but quantum information science also provides new "replacement" encryption schemes to overcome this issue. Since Shor's algorithm, many new algorithms have been developed, giving significant (though not always exponential) speedups for different problems, including search problems, simulated annealing, and quantum Monte Carlo algorithms that would help in determining the properties of certain many-body quantum systems. However, the development of such algorithms is still a growing field.
11.5.5.1 Could Quantum Computing Methods Improve Iterative Calculations in CFD?
According to some researchers and CFD experts, such as [Roopesh Mathur], it is too early to tell whether quantum computing will have an impact. The algorithms and software infrastructure needed to exploit the new computing platform have not been developed yet, so quantum computing will take a very long time to grow into a practical tool for the CAE field. From another point of view, [Victor Eijkhout], who wrote a book on the subject, notes that quantum computing seems to be largely limited to integer computation; if that is true, one could phrase a floating-point computation in fixed point, which is effectively integer arithmetic. Yet another researcher agrees with the above analysis and adds that most present quantum computers are not general-purpose (i.e., they are built around particular optimization problems). Over the next two decades, development will likely resemble what happened with silicon computers in the 1970s. Considering the predictions of quantum computing experts, however, it could bring a paradigm shift in reducing the complexity of analyzing the vast data generated by simulations. A general quantum computer, if scaled to even 100,000 qubits, would disrupt not only CFD but also other analytical work across the sciences (computer scientists perhaps most of all). The biggest obstacles to this scenario becoming reality are noise and scalability (i.e., not all qubits are mutually coupled at any given time).
Although D-Wave has built a 2,000-qubit machine (which in theory could exceed the combined computational power of the top 1000 supercomputers), its realized computational power is much smaller, because it is bogged down by these same noise and scalability problems; it is still impressive, since it can beat classical machines on some calculations. In short, the next two decades will most likely be spent by quantum scientists developing the architecture of a fully scalable quantum computer and defining the hardware that addresses these issues. In the meantime, advances in classical computing (speed-ups from powerful GPUs and machine/deep learning techniques) are sufficient to solve most research problems in CFD272.
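As a rough reference for the factoring comparison above (commonly quoted asymptotic estimates, not figures from this text), the best known classical algorithm, the general number field sieve, is sub-exponential in the size of the number N being factored, while Shor's algorithm is polynomial in the number of digits:

T_{\mathrm{GNFS}}(N) \sim \exp\!\left[\left(\tfrac{64}{9}\right)^{1/3}(\ln N)^{1/3}(\ln\ln N)^{2/3}\right], \qquad T_{\mathrm{Shor}}(N) \sim \mathcal{O}\!\left((\log N)^{3}\right)

The sub-exponential growth on the left is what turns the step from a 150-digit to a 400-digit number into a jump from a year to the lifetime of the universe, while the polynomial growth on the right keeps the corresponding quantum cost comparatively modest.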

272 https://www.quora.com


11.5.5.2 Quantum Speedup for Turbulent Combustion Simulations
Having said that, there are still a number of problems in CFD that invite quantum computation. The original objective of the QCFD team's proposal is to develop quantum computation as a tool for future large eddy simulation (LES) of turbulent reacting flows. Classical LES based on the filtered density function (FDF) methodology can be used for prediction of both low- and high-speed propulsion systems. The primary advantage of FDF is that it provides a systematic means of sub-grid scale (SGS) closure in reacting flows. This closure is cast in terms of modeled stochastic differential equations (SDEs). Our principal goal will be to develop quantum speedups for the simulation of these SDEs. If successful, this research could make a revolutionary impact on future prediction of systems such as gas turbine engines and scramjet combustors.
11.5.5.3 Large Eddy Simulation (LES) and Filtered Density Function (FDF)
Turbulent reacting flows are of significant interest to the military and to many industries, with broad applications in propulsion systems and combustion devices. It is now widely recognized that LES provides the optimal means of capturing the detailed unsteady physics of such flows. The primary challenge in LES is accurate modeling of the SGS quantities. The filtered density function (FDF) methodology, including its mass-weighted form, the filtered mass density function (FMDF), has proven particularly effective for this purpose. The FDF is the counterpart of the probability density function (PDF) methods used in Reynolds-averaged simulations, commonly referred to as Reynolds-averaged Navier-Stokes (RANS). The FDF essentially provides the PDF of the SGS quantities. In its stand-alone form, it must account for the joint statistics of five SGS variables: energy, pressure, frequency, velocity, and all of the other scalar variables. As noted above, the overall objective of this proposal is to introduce quantum computing for FDF simulation. The idea appears promising because the essential means of enacting FDF is via modeled SDEs which portray the essential physics of the SGS; these SDEs describe all of the basic transport variables and account for the couplings of turbulence, variable density, and differential diffusion in both low- and high-speed flows. A minimal sketch of how such a modeled SDE is integrated numerically is given below.
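To illustrate what "simulation of modeled SDEs" involves in practice, the following minimal C sketch integrates a simple Langevin-type relaxation equation, d(phi) = -(phi - phi_mean)/tau dt + sqrt(2*sigma2/tau) dW, with the Euler-Maruyama scheme. The equation, the coefficients (relaxation time tau, target variance sigma2), and the particle count are illustrative assumptions; they are not taken from the FDF closure described above.

/* Euler-Maruyama integration of a model Langevin SDE for a scalar phi:
   d(phi) = -(phi - phi_mean)/tau * dt + sqrt(2*sigma2/tau) * dW
   Only an illustrative stand-in for the modeled SGS transport SDEs. */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

/* Box-Muller: one standard normal sample from two uniform deviates. */
static double randn(void)
{
    double u1 = (rand() + 1.0) / (RAND_MAX + 2.0);
    double u2 = (rand() + 1.0) / (RAND_MAX + 2.0);
    return sqrt(-2.0 * log(u1)) * cos(2.0 * M_PI * u2);
}

int main(void)
{
    const int    n_particles = 10000;   /* notional Monte Carlo particles     */
    const int    n_steps     = 1000;    /* time steps                         */
    const double dt          = 1.0e-3;  /* time-step size                     */
    const double tau         = 0.05;    /* relaxation (mixing) time scale     */
    const double sigma2      = 1.0;     /* target stationary variance         */
    const double phi_mean    = 0.0;     /* filtered mean the scalar relaxes to */

    double *phi = malloc(n_particles * sizeof *phi);
    if (!phi) { fprintf(stderr, "allocation failed\n"); return 1; }

    for (int p = 0; p < n_particles; p++)   /* initial condition: phi = 1 */
        phi[p] = 1.0;

    for (int s = 0; s < n_steps; s++) {
        for (int p = 0; p < n_particles; p++) {
            double dW = sqrt(dt) * randn();                  /* Wiener increment */
            phi[p] += -(phi[p] - phi_mean) / tau * dt        /* drift            */
                      + sqrt(2.0 * sigma2 / tau) * dW;       /* diffusion        */
        }
    }

    double mean = 0.0, var = 0.0;
    for (int p = 0; p < n_particles; p++) mean += phi[p];
    mean /= n_particles;
    for (int p = 0; p < n_particles; p++) var += (phi[p] - mean) * (phi[p] - mean);
    var /= n_particles;
    printf("sample mean = %.4f, sample variance = %.4f\n", mean, var);

    free(phi);
    return 0;
}

The stationary statistics of the integrated ensemble (mean relaxing to phi_mean, variance approaching sigma2) provide a quick consistency check of the scheme.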


12 Appendix A
Routine for Inverse Distance Weighted Interpolation (Shepard's Method)

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <stdbool.h>
#include "cse.h"

#define Large 1.0e+30
#define Small -1.0e+30

/* Accumulate the weighted interface displacements (Shepard right-hand side);
   the results are returned through the dum_x/dum_y/dum_z pointers. */
void RHS (double dx[], double dy[], double dz[], int num,
          double *dum_x, double *dum_y, double *dum_z, double omega[])
{
    int i;
    double dx_new, dy_new, dz_new, DX, DY, DZ;
    *dum_x = *dum_y = *dum_z = 0.0;
    for (i = 0; i < num; i++) {
        dx_new = dx[i];
        dy_new = dy[i];
        dz_new = dz[i];
        DX = omega[i] * dx_new;
        DY = omega[i] * dy_new;
        DZ = omega[i] * dz_new;
        *dum_x += DX;
        *dum_y += DY;
        *dum_z += DZ;
    }
}

/* Normalized inverse-distance (Franke-Little) weights from the distances h[]. */
void get_weight (double h[], double omega[], int num)
{
    int i;
    double hmax = Small, F2 = 0.0;
    double F[num];

    /* largest distance in the set; initialize all weights to one */
    for (i = 0; i < num; i++) {
        hmax = (h[i] > hmax) ? h[i] : hmax;
        omega[i] = 1.0;
    }
    for (i = 0; i < num; i++) {
        F[i] = ((hmax - h[i]) / (hmax * h[i])) * ((hmax - h[i]) / (hmax * h[i]));
        F2  += F[i];
    }
    for (i = 0; i < num; i++) {
        omega[i] = F[i] / F2;
        if (h[i] == 0.0) omega[i] = 1.0;      /* coincident point dominates */
        if (omega[i] < 0.0 || omega[i] > 1.0) {
            fprintf(stderr, " omega[i] = %.3lf \n", omega[i]);
            fprintf(stderr, " Error - The weight function should be between 0 and 1\n");
            exit(1);
        }
    }
}

/* Diagonal length of the bounding box of a point set. */
double Get_R (double x[], double y[], double z[], int num)
{
    double xmax, ymax, zmax, xmin, ymin, zmin, R;
    xmax = ymax = zmax = Small;
    xmin = ymin = zmin = Large;
    for (int i = 0; i < num; i++) {
        xmax = (x[i] > xmax) ? x[i] : xmax;
        ymax = (y[i] > ymax) ? y[i] : ymax;
        zmax = (z[i] > zmax) ? z[i] : zmax;
        xmin = (x[i] < xmin) ? x[i] : xmin;
        ymin = (y[i] < ymin) ? y[i] : ymin;
        zmin = (z[i] < zmin) ? z[i] : zmin;
    }
    R = sqrt((xmax - xmin) * (xmax - xmin) +
             (ymax - ymin) * (ymax - ymin) +
             (zmax - zmin) * (zmax - zmin));
    return R;
}

void EIDW (int num_interface_nodes1, int num_nodes1, int num_outer_nodes1)
/* current global search is insufficient and CPU intensive and should be
   modified to a more localized search method */
{
    double dum_x, dum_y, dum_z,
           omega[num_interface_nodes1], h[num_interface_nodes1],
           xx[2], yy[2], zz[2],
           xmin, xmax, ymin, ymax, zmin, zmax,
           R_solid, R_fluid, hmin, d1, d2, d3, box;
    int    num_nodes, num_interface_nodes, line, i_bar, i, j, ii, n_solid;
    bool   skip;
    double dx[num_interface_nodes1], dy[num_interface_nodes1], dz[num_interface_nodes1];
    /* the fluid-mesh arrays are too large for the stack, so allocate them on the heap */
    double *x = malloc((size_t)num_nodes1 * sizeof *x);
    double *y = malloc((size_t)num_nodes1 * sizeof *y);
    double *z = malloc((size_t)num_nodes1 * sizeof *z);
    if (x == NULL || y == NULL || z == NULL) {
        printf (" **Error - could not allocate memory for the fluid mesh.\n");
        exit (1);
    }


    // Read the data (test - VTK format / CSE)
    int index = 0;
    FILE *file_ptr;

    /* interface patch "dum0": two nodes per line */
    file_ptr = fopen("dum0", "r");
    if (file_ptr == NULL) {
        printf (" ** Error opening dum0 file.\n");
        exit (1);
    }
    int n0;
    fscanf (file_ptr, "%d\n", &n0);
    for (line = 0; line < n0/2; line++) {
        fscanf (file_ptr, "%lf %lf %lf %lf %lf %lf",
                &xx[0], &yy[0], &zz[0], &xx[1], &yy[1], &zz[1]);
        for (i_bar = 0; i_bar < 2; i_bar++) {
            i = i_bar;
            if (line > 0) i = line * 2 + i_bar;
            dx[i+index] = xx[i_bar];
            dy[i+index] = yy[i_bar];
            dz[i+index] = zz[i_bar];
        }
    }
    index = n0;
    printf(" Finish reading the dum0 file. \n");
    fclose (file_ptr);


file_ptr = fopen("dum1", "r") ; if (file_ptr == NULL) { printf (" ** Error opening dum1 file.\n") ; exit (1) ; } int n1 ; fscanf (file_ptr, "%d\n", &n1); for (line = 0 ; line < n1/2 ; line++) { fscanf (file_ptr,"%lf %lf %lf %lf %lf %lf",&xx[0],&yy[0],&zz[0],&xx[1],&yy[1],&zz[1]) for (i_bar = 0 ; i_bar < 2 ; i_bar++ ) { i = i_bar ; if (line > 0 ) i = line * 2 + i_bar ; dx[i+index] = xx[i_bar] ; dy[i+index] = yy[i_bar] ; dz[i+index] = zz[i_bar] ; }

} index = n0+n1; dx[index-1]= -4.6567497253 ; dy[index-1]= -0.0067161722109 ; dz[index-1]= -0.55055594444 ; printf(" Finish reading the dum1 file. \n");


    fclose (file_ptr);

    file_ptr = fopen("dum2", "r");
    if (file_ptr == NULL) {
        printf (" ** Error opening dum2 file.\n");
        exit (1);
    }
    int n2;
    fscanf (file_ptr, "%d\n", &n2);
    for (line = 0; line < n2/2; line++) {
        fscanf (file_ptr, "%lf %lf %lf %lf %lf %lf",
                &xx[0], &yy[0], &zz[0], &xx[1], &yy[1], &zz[1]);
        for (i_bar = 0; i_bar < 2; i_bar++) {
            i = i_bar;
            int index = n0 + n1;
            if (line > 0) i = line * 2 + i_bar;
            dx[i+index] = xx[i_bar];
            dy[i+index] = yy[i_bar];
            dz[i+index] = zz[i_bar];
        }
    }
    index = n0 + n1 + n2;
    printf(" Finish reading the dum2 file. \n");
    fclose (file_ptr);


    file_ptr = fopen("dum3", "r");
    if (file_ptr == NULL) {
        printf (" ** Error opening dum3 file.\n");
        exit (1);
    }
    int n3;
    fscanf (file_ptr, "%d\n", &n3);
    for (line = 0; line < n3/2; line++) {
        fscanf (file_ptr, "%lf %lf %lf %lf %lf %lf",
                &xx[0], &yy[0], &zz[0], &xx[1], &yy[1], &zz[1]);
        for (i_bar = 0; i_bar < 2; i_bar++) {
            i = i_bar;
            int index = n0 + n1 + n2;
            if (line > 0) i = line * 2 + i_bar;
            dx[i+index] = xx[i_bar];
            dy[i+index] = yy[i_bar];
            dz[i+index] = zz[i_bar];
        }
    }
    index = n0 + n1 + n2 + n3;
    num_interface_nodes = index;
    printf(" Finish reading the dum3 file. \n");


    fclose (file_ptr);
    printf(" n0 n1 n2 n3 = %d,%d,%d,%d \n", n0, n1, n2, n3);
    printf(" num_interface_nodes = %d \n", num_interface_nodes);

    /* interior (fluid) mesh nodes */
    file_ptr = fopen("internal", "r");
    if (file_ptr == NULL) {
        printf (" ** Error opening internal file.\n");
        exit (1);
    }
    fscanf (file_ptr, "%d\n", &num_nodes);

    // check for memory
    int *cfdpointer;
    cfdpointer = (int*) malloc ((size_t)num_nodes * sizeof(int));
    if (cfdpointer == NULL) {
        printf (" num_nodes = %d \n", num_nodes);
        printf (" **Error - could not allocate memory for cfd data.\n");
        exit (1);
    }
    free (cfdpointer);

    for (line = 0; line < num_nodes/2; line++) {
        fscanf (file_ptr, " %lf %lf %lf %lf %lf %lf",
                &xx[0], &yy[0], &zz[0], &xx[1], &yy[1], &zz[1]);
        for (i_bar = 0; i_bar < 2; i_bar++) {
            i = i_bar;
            if (line > 0) i = line * 2 + i_bar;
            x[i] = xx[i_bar];
            y[i] = yy[i_bar];
            z[i] = zz[i_bar];
        }
    }
    fclose (file_ptr);
    printf(" Finished reading the internal file. \n");
    printf(" num_nodes = %d \n", num_nodes);

    // get max/min of the interface nodes
    xmax = ymax = zmax = Small;
    xmin = ymin = zmin = Large;
    for (i = 0; i < num_interface_nodes; i++) {
        xmax = (dx[i] > xmax) ? dx[i] : xmax;
        ymax = (dy[i] > ymax) ? dy[i] : ymax;
        zmax = (dz[i] > zmax) ? dz[i] : zmax;
        xmin = (dx[i] < xmin) ? dx[i] : xmin;
        ymin = (dy[i] < ymin) ? dy[i] : ymin;
        zmin = (dz[i] < zmin) ? dz[i] : zmin;
    }


    R_solid = Get_R (dx, dy, dz, num_interface_nodes);
    R_fluid = Get_R (x,  y,  z,  num_nodes);

    // loop over each cfd field node (EIDW - Shepard's Method)
    n_solid = 0;
    box = R_fluid;
    for (j = 0; j < num_nodes; j++) {
        skip = false;
        double xf = x[j];
        double yf = y[j];
        double zf = z[j];

        // get Euclidean distances and normalized weights
        hmin = Large;
        for (i = 0; i < num_interface_nodes; i++) {
            d1 = xf - dx[i];
            d2 = yf - dy[i];
            d3 = zf - dz[i];
            h[i] = sqrt(d1*d1 + d2*d2 + d3*d3);
            hmin = (h[i] < hmin) ? h[i] : hmin;
        }
        printf (" pass 1 j = %d \n", j);

        // get weight function values and the weighted displacement
        get_weight (h, omega, num_interface_nodes);
        RHS (dx, dy, dz, num_interface_nodes, &dum_x, &dum_y, &dum_z, omega);

        // update new field positions for cfd
        if (skip) {
            x[j] = xf;
            y[j] = yf;
            z[j] = zf;
        } else {
            x[j] = dum_x + x[j];
            y[j] = dum_y + y[j];
            z[j] = dum_z + z[j];
        }
    }
    printf (" end of big loop .....\n");

    // output (test - VTK format / CSE)
    file_ptr = fopen("fluid_mesh_new", "w");
    if (file_ptr == NULL) {
        printf (" ** Error opening cfd_mesh_new file to write.");
        exit (1);
    }
    printf (" trying to write \n");


printf (" cfd.num_nodes = %d \n", num_nodes) ; for (line = 0 ; line < num_nodes/2 ; line++) { for (i_bar = 0 ; i_bar < 2 ; i_bar++ ) { ii = i_bar ; if (line > 0 ) ii = line * 2 + i_bar ; xx[i_bar] = x[ii]; yy[i_bar] = y[ii]; zz[i_bar] = z[ii]; } fprintf (file_ptr," %.11f %.11f %.11f %.11f %.11f %.11f\n", xx[0],yy[0],zz[0],xx[1],yy[1],zz[1]) ; } fclose (file_ptr) ; printf(" Done...\n"); } int main() { #define num_interface_nodes1 30000 #define num_nodes1 400000 #define num_outer_nodes1 50000 // check memory requirements int * Workarray = NULL; Workarray = (int*) malloc (sizeof(num_interface_nodes1)); if (NULL == Workarray) { printf (" num_interface_nodes1 = %d \n", num_interface_nodes1); printf (" **Error - could not allocate memory for cfd data.\n"); exit (1); } Workarray = (int*) malloc (sizeof(num_nodes1)); if (Workarray == NULL) { printf (" num_nodes1 = %d \n", num_nodes1); printf (" **Error - could not allocate memory for cfd data.\n"); exit (1); } Workarray = (int*) malloc (sizeof(num_outer_nodes1)); if (Workarray == NULL) { printf (" num_outer_nodes1 = %d \n", num_outer_nodes1); printf (" **Error - could not allocate memory for cfd data.\n"); exit (1); } EIDW (num_interface_nodes1,num_nodes1,num_outer_nodes1) ; return 0 ; }