Special Topics in CFD

CFD Open Series Revision: 1.85.4

Special Topics in CFD

Ideen Sadrehaghighi, Ph.D.

Parallel Processing & HPC
Error Analysis
Uncertainty Quantification
CFD in Biomedical Applications
Smooth Particle Hydrodynamics
Meshless Schemes
Reduced Order Modeling
Artificial Intelligence

ANNAPOLIS, MD

Contents

1 Introduction ..... 14

1.1 Computational Predictability plus Verification & Validation ..... 14
1.2 Multiscale/Multiphysics ..... 15
1.3 Mesh Free Methods for CFD ..... 15
1.4 Integrated Simulations of Complex Systems ..... 16

2 Reduced Order Modeling (ROM) ..... 18

2.1 Various Techniques ..... 19
2.2 Common Features Shared by Reduced Order Methods (ROM) ..... 19
2.3 Reduced Basis Methods ..... 20
2.3.1 Lagrange ..... 20
2.3.2 Hermite ..... 20
2.3.3 Taylor ..... 20
2.3.4 Snapshot Sets ..... 20
2.4 Proper Orthogonal Decomposition (POD) Spaces ..... 21
2.4.1 Galerkin Projection into POD Space ..... 22
2.4.2 Case Study - Vortex Shedding Around a Circular Cylinder using a POD-Galerkin Method ..... 22
2.4.2.1 Governing Equations ..... 22
2.4.2.2 Details of the Full Order Simulation ..... 23
2.4.2.3 Details of the ROM Simulation ..... 23
2.4.2.4 Analysis of the Results ..... 24
2.5 Addressing Challenges in Reduced-Order Modeling ..... 25
2.6 Reduced Order CFD Simulation ..... 26
2.7 Case Study 1 - Designing Parameters of Test Stage Axial Turbine ..... 28
2.7.1 Blade Reverse Engineering as Applied to Geometry Definition ..... 29
2.7.2 3D Aerodynamic Computation ..... 29
2.8 Case Study 2 - Cooling Air Flow Rate ..... 29
2.9 Reduced Order Model using Empirical Relationship ..... 30

3 Computational Error and Uncertainty Quantification ..... 32

3.1 Classification of Errors ..... 33
1.1 Physical Modeling Error ..... 33
3.1.1 Uncertainty Quantification of Turbulence Models ..... 34
3.1.2 Single Turbulence Model with Perturbation ..... 35
3.1.2.1 Transonic Flow over the ONERA M6 Wing ..... 36
3.1.2.2 Supersonic Flow Through a Converging-Diverging Seiner Nozzle ..... 37
3.1.3 Multiple Turbulence Models without Perturbation ..... 38
3.2 Geometrical Modeling Errors ..... 39
3.3 Spatial Discretization (Governing Equations) Errors ..... 39
3.3.1 Higher Order Discretization ..... 41
3.4 Discretization Errors ..... 41
3.4.1 Mesh Density ..... 41
3.4.2 Grid Independence Study ..... 43
3.4.3 Grid Topology ..... 43
3.4.4 Sources of Discretization Error ..... 44

3.4.5 Case Study - Hypersonic Flow over an Axisymmetric Sphere-Cone ..... 44
3.4.6 Estimating Discretization Error ..... 45
3.4.6.1 Case Study - Domain Discretization Error for the Transitional Flow over a Sharp Cone ..... 46
3.5 Temporal Discretization Errors ..... 47
3.6 Iterative Convergence Errors ..... 49
3.6.1 Monitoring Convergence using Residual History ..... 50
3.6.2 Monitoring Quantitative Convergence ..... 50
3.6.3 Norms of Convergence Error ..... 50
3.6.4 Case Study - 2D Flow Over a Hill ..... 51
3.7 Computer Round-off Errors ..... 53
3.8 Truncation Errors ..... 53
3.9 Code Errors ..... 54
3.10 Benchmarking & Inter-Code Issues ..... 54
3.10.1 Case Study 1 - Results of M6 Wing using NASA Codes on the Same Grid ..... 56
3.10.2 Case Study 2 - Grid Convergence for 3D Benchmark Turbulent Flows ..... 56
3.10.2.1 Subsonic Flow around a Hemisphere Cylinder ..... 56
3.10.2.2 Geometry, Flow Parameters, and Boundary Conditions ..... 57
3.10.2.3 Results for Hemisphere Cylinder ..... 58
3.10.2.4 Forces and Pitching Moment ..... 58
3.10.2.5 Fine Grid Surface Pressure, Skin Friction, and Off-Body Variation ..... 60
3.10.2.6 Effect of Grid Refinement on Surface Pressure and Skin Friction ..... 61
3.10.2.7 Transonic Flow Around an M6 Wing ..... 62
3.10.2.8 Geometry, Flow Parameters, and Boundary Conditions ..... 62
3.10.2.9 Grids for M6 Wing ..... 62
3.10.2.10 Results for M6 Wing ..... 63
3.10.2.11 Concluding Remarks ..... 65
3.11 Usage Errors ..... 67
3.12 What to Trust and What Not To? ..... 67
3.13 Verification and Validation for Computational Simulation ..... 68

4 CFD in Biomedical Applications ..... 70

4.1 Literature Survey in Biomedical CFD ..... 71
4.1.1 Cardiovascular Systems ..... 71
4.1.2 Respiratory Systems ..... 72
4.2 Merits and Limitations of Biomedical Applications in CFD ..... 74
4.3 Hemodynamic Flow Modeling ..... 76
4.4 Boundary Conditions ..... 76
4.5 Structural Deformation Models ..... 77
4.6 Fluid-Structure Interaction Techniques ..... 77
4.7 Future of CFD in Biomedical Engineering ..... 78
4.8 Case Study 1 - Modeling Fluid-Structure Interaction in a Heart Valve ..... 79
    Background ..... 79
    Advancing Heart Valve Research via Simulation ..... 79
    Modeling the Opening and Closing of a Heart Valve in COMSOL Multiphysics ..... 79
    Simulation Results for Fluid-Structure Interaction in a Heart Valve ..... 80
    Improving the Design of Medical Devices with FSI Modeling ..... 80

4.9 Case Study 2 - CFD Simulation of Human Carotid Artery Bifurcation Based on Anatomy and Volumetric Blood Flow Rate Measured with MRI ..... 81
4.9.1 Approaches ..... 81
4.9.2 Results and Discussion ..... 82
4.10 Case Study 3 - CFD Analysis of the Effect of Plaques in the Left Coronary Artery ..... 83
4.10.1 Patient Data Selection for Generation of Left Coronary Artery Model ..... 84
4.10.2 Realistic Plaques Modelling ..... 85
4.10.3 Generation of Computational Models ..... 85
4.10.4 Application of Physiological Parameters ..... 86
4.10.5 Performance of Computational Hemodynamic Analysis ..... 86
4.10.6 CFD Results of the Left Coronary Artery ..... 87
4.10.6.1 Cutting Plane Visualization ..... 87
4.10.6.2 Wall Shear Stress (WSS) Comparisons ..... 88
4.10.7 Discussion ..... 89
4.10.8 Limitation ..... 91

5 Mesh Free Methods for CFD ..... 92

5.1 Smooth Particle Hydrodynamics (SPH) ..... 92
5.1.1 Mesh Free Local Petrov-Galerkin ..... 93
5.1.2 Mesh Free Methods Based on Radial Basis Functions ..... 94
5.1.3 Finite Point Methods ..... 94
5.1.4 Meshless Boundary Schemes ..... 95
5.2 Solution Procedure for Mesh Free Methods ..... 95
5.2.1 Domain Representation ..... 95
5.2.2 Function Approximation ..... 95
5.2.3 Formation of System Equations ..... 96
5.2.4 Solving the Global Equations ..... 96
5.3 Method of Smooth Particle Hydrodynamics (SPH) ..... 96
5.3.1 Formulation ..... 96
5.3.2 Smoothing Kernels ..... 97
5.3.3 Updating of Smoothing Length h ..... 99
5.3.3.1 Constant ..... 99
5.3.3.2 Variable ..... 99
5.3.4 Boundary Treatment ..... 99
5.3.5 Virtual Particles ..... 99
5.3.6 Ghost Particles ..... 100
5.3.7 Summary and Recap ..... 100
5.3.8 Case Study 1 - Lid Driven Cavity Problem ..... 100
5.3.9 Case Study 2 - Two-Dimensional Convection-Diffusion Problem ..... 101
5.4 RKPM Method ..... 101
5.5 Lagrangian Description of Fluid Dynamics Using SPH ..... 102
5.5.1 Default Kernel ..... 102
5.5.2 Numerical Time Integration ..... 104
5.5.2.1 The Implicit Euler Scheme ..... 104
5.5.2.2 The Verlet Scheme ..... 104
5.5.2.3 The Leap-Frog Scheme ..... 104
5.5.3 Collision Handling ..... 105
5.5.4 Case Study 1 - Comparison of Weakly Compressible and Incompressible SPH ..... 105

5.5.4.1 Formulation of Problem ..... 105
5.5.4.2 Results ..... 106
5.5.5 Case Study 2 - Dam Break Water Flow using Lagrangian Description ..... 106
5.5.6 Case Study 3 - Dam Break using MLPG-RBF and Shallow Water Equations ..... 107
5.5.7 Case Study 4 - SPH Method for Evaporating Multiphase Flows ..... 108
5.5.7.1 Basic Formulations of the SPH Method ..... 109
5.5.7.2 Evaporation of a Static Drop ..... 109
5.5.7.3 Evaporation of a Dynamic Drop Impacting on a Hot Surface ..... 109
5.5.7.4 Concluding Remarks ..... 110

6 CFD Applications in Other Areas ..... 112

6.1 Food Processing ..... 112
6.1.1 Drying ..... 112
6.1.2 Sterilization ..... 113
6.1.3 Mixing ..... 114
6.1.4 Refrigeration ..... 115
6.1.5 Crystallization ..... 115
6.1.6 Pasteurization ..... 115
6.2 CFD in Semiconductor Industry ..... 116
6.2.1 Brief Description of Semiconductor Devices ..... 116
6.2.2 Thermal Management in Semiconductors ..... 117
6.2.3 Can You Really Fry an Egg on a CPU? ..... 118
6.3 Magneto-Hydro-Dynamics (MHD) ..... 118
6.3.1 MHD Equations ..... 119
6.3.2 Case Study - Dynamics of a Q2D Wake Behind a Cylinder in Presence of MHD Environment ..... 120
6.3.2.1 Numerical Method and Geometry ..... 121
6.3.2.2 Result and Discussion ..... 122
6.4 Maxwell's Equations - Electromagnetic Waves ..... 122
6.4.1 Historical Perspective ..... 122
6.4.2 The Finite-Difference Time-Domain Method (FDTD) ..... 122
6.4.3 Strengths of FDTD Modeling ..... 123
6.4.4 Weaknesses of FDTD Modeling ..... 123
6.4.5 Case Study - 1D Maxwell Equation ..... 124
6.4.5.1 Boundary Conditions ..... 125
6.5 Mechanisms of Nanofluids ..... 125
6.5.1 What is a Nanofluid? ..... 125
6.5.2 What Applications are Suitable for Nanofluids? ..... 126
6.5.3 What are the Advantages and Disadvantages of Nanofluids? ..... 127
6.5.4 CFD Techniques for Nanofluid Flow Solution ..... 127
6.5.5 Macroscale Based Techniques ..... 128
6.5.5.1 Finite Differencing Method ..... 128
6.5.5.2 Finite Volume Method ..... 128
6.5.5.2.1 Common FVM Solvers ..... 129
6.5.5.3 Finite Element Method ..... 129
6.5.6 Dynamics of Nanoparticle Motion in a Liquid ..... 130
6.5.7 Other Macroscale Based Techniques ..... 131
6.5.7.1 Control Volume Finite Element Method ..... 131

6.5.7.2 Boundary Element Method ..... 132
6.5.7.3 Spectral Method ..... 132
6.5.7.4 Meshless Methods ..... 132
6.5.7.5 Lattice Boltzmann Method ..... 132
6.5.7.6 Dissipative Particle Dynamics Method ..... 133
6.5.8 Microscale Based Techniques (Molecular Dynamic Simulation) ..... 133
6.5.9 Flow Properties in Nanofluids Environment ..... 133
6.5.9.1 Density and Specific Heat ..... 134
6.5.9.2 Thermal Conductivity ..... 134
6.5.9.3 Viscosity ..... 134
6.5.9.4 Heat Transfer Coefficient ..... 135
6.5.10 Numerical Simulation ..... 136

7 Modern Computer Architectures ..... 139

7.1 Background ..... 139
7.2 Memory Technology ..... 140
7.2.1 Memory Access Time ..... 140
7.2.2 Memory Access Patterns ..... 140
7.2.2.1 Loop Interchange to Ease Memory Access Patterns ..... 141
7.2.3 Virtual Memory ..... 141
7.3 Registers ..... 142
7.4 Caches ..... 142
7.4.1 Cache Organization ..... 144
7.4.1.1 Direct-Mapped Cache ..... 145
7.4.1.2 Fully Associative Cache ..... 146
7.4.1.3 Set-Associative Cache ..... 146
7.4.1.4 Instruction Cache ..... 147
7.5 Timing a Program ..... 147
7.5.1 Timing a Portion of the Program ..... 149
7.5.2 Getting Time Information ..... 149
7.6 Subroutine Profiling ..... 150
7.7 Loop Optimizations ..... 152
7.7.1 Operation Counting ..... 152
7.7.2 Basic Loop Un-Rolling ..... 154
7.7.3 Loops with Low Trip Counts ..... 155
7.7.4 Fat Loops ..... 155
7.7.5 Loops Containing Procedure Calls ..... 156
7.7.6 Loops with Branches ..... 156
7.7.7 Nested Loops ..... 156
7.7.8 Outer Loop Un-Rolling ..... 157
7.7.9 Loop Interchange to Move Computations to the Center ..... 158
7.8 Matrix Multiplication ..... 158
7.8.1 Matrix Optimization ..... 159
7.8.2 Blocking to Ease Memory Access Patterns ..... 160
7.9 Shared-Memory Parallel Processors ..... 160
7.9.1 Dependencies ..... 161
7.9.1.1 Control Dependencies ..... 162
7.9.1.2 Data Dependencies ..... 163

7.9.2 Forming a Flow Graph ..... 163
7.9.2.1 Loop Dependencies ..... 164
7.9.2.2 Loop-Carried Dependencies ..... 165
7.9.2.3 Flow Dependencies ..... 166
7.9.2.4 Output Dependencies ..... 167
7.9.2.5 Dependencies Within an Iteration ..... 167
7.10 Pointer Ambiguity in C ..... 168
7.11 Some Preliminary Concepts in Quantum Computation ..... 170
7.11.1 What is Quantum Computing? ..... 170
7.11.2 How Do Quantum Computers Work? ..... 170
7.11.3 Classical vs. Quantum Computing ..... 171
7.11.4 Qubits and Power of a Quantum Computer ..... 171
7.11.5 Quantum Algorithms: Programming a Quantum Computer ..... 171
7.11.5.1 Could Quantum Computing Methods Improve Iterative Calculations in CFD? ..... 172
7.11.5.2 Quantum Speedup for Turbulent Combustion Simulations ..... 172
7.11.5.3 Large Eddy Simulation (LES) and Filtered Density Function (FDF) ..... 173

8 Parallel Processing and HPC ..... 174

8.1 Classification of Parallel Computer Architectures ..... 174
8.2 Shared Memory Multi-Processor ..... 176
8.3 Distributed Memory Multi-Computer ..... 176
8.4 Efficiency and Scalability ..... 177
8.4.1 Weak vs. Strong Scaling ..... 179
8.4.2 Scalability vs. Performance ..... 179
8.4.3 Load Balancing ..... 180
8.5 Performance of CFD Codes ..... 180
8.5.1 CFD for Next Generation High Performance Computing ..... 181
8.5.2 Hardware Consideration and CPU vs. GPU Technology ..... 181
8.5.2.1 Case Study 1 - 2D Laplace Equation ..... 182
8.5.2.2 Results ..... 182
8.5.2.3 Future Work - Heterogeneous Computing ..... 182
8.5.3 Case Study 2 - Unstructured Grid Based CFD Solvers on Modern Graphics Hardware ..... 183
8.5.3.1 Background and Literature Survey ..... 183
8.5.3.2 Implementation on Graphics Hardware ..... 184
8.5.3.3 Test Cases ..... 184
8.6 Software Consideration and Message Passing Interface (MPI) ..... 186
8.7 Cloud Computing: Definition and Features ..... 187
8.7.1 "Infrastructure as a Service" (IaaS) ..... 187
8.7.2 "Platform as a Service" (PaaS) ..... 187
8.7.3 "Software as a Service" (SaaS) ..... 187
8.8 High Performance Computing (HPC) ..... 187
8.8.1 Real Application Performance ..... 189
8.8.2 Choosing the Right Interconnect ..... 189
8.9 Grid Computing vs. HPC ..... 189
8.10 HPC vs. HSC ..... 190
8.11 The Moral of the Story ..... 190
8.12 HPC vs. Parallel Computing ..... 190
8.13 HPC vs. HTC ..... 191

9 CFD and HPC Trends Forecasted for 2030 ..... 192

9.1 Comparison of Semiconductor Fabrication Sizes in HPC ..... 192
9.2 Current Status of CFD ..... 193
9.2.1 Conceptual Design ..... 193
9.2.2 Preliminary/Detailed Design ..... 193
9.2.3 Product Validation and Certification ..... 194
9.2.4 CFD Usage of High Performance Computing (HPC) ..... 194
9.2.5 Turbulence Modeling ..... 194
9.2.6 Process Automation ..... 195
9.2.7 Solution Uncertainty and Robustness ..... 195
9.2.8 Multidisciplinary Analysis and Optimization (MDAO) ..... 196
9.3 Vision of CFD in 2030 as Anticipated by NASA ..... 196
9.3.1 Technology Roadmap to Achieve GC Challenge ..... 198
9.3.1.1 High Performance Computing (HPC) ..... 198
9.3.1.2 Physical Modeling ..... 199
9.3.1.3 Numerical Algorithms ..... 199
9.3.1.4 Uncertainty Quantification (UQ) ..... 201
9.3.1.5 Geometry and Grid Generation ..... 201
9.3.1.6 Knowledge Extraction ..... 202
9.3.1.7 Multidisciplinary Design and Optimization ..... 203
9.3.2 Recommendations ..... 203
9.4 HPC Envisioned by Department of Energy (DOE) ..... 205
9.4.1 What is Exascale Computing? ..... 205
9.4.2 Why Exascale? ..... 205
9.4.3 Range of Applications that may be Transformed by Going to the Exascale ..... 205
9.4.3.1 Aerospace, Airframes and Jet Turbines ..... 206
9.4.3.2 Combustion ..... 208
9.4.3.3 Climate Modeling ..... 209
9.4.3.4 Computational Biology ..... 209
9.4.3.5 Materials Science ..... 210
9.4.3.6 Nuclear Engineering ..... 211
9.4.3.7 Other Disciplines ..... 212
9.4.4 Challenges in Going to the Exascale ..... 212
9.4.4.1 The Hardware Challenges ..... 213
9.4.4.2 The Applied Mathematics Challenges ..... 214
9.4.4.3 Mathematical Modeling ..... 214
9.4.4.4 Numerical Algorithms ..... 215
9.4.4.5 The Algorithmic Challenges ..... 216
9.4.4.6 Computer Science Challenges ..... 217
9.4.4.7 Educational Challenges ..... 217

10 Artificial Intelligence in CFD ..... 219

10.1 Background ..... 219
10.2 Machine Learning ..... 219
10.2.1 Difference Between Artificial Intelligence and Machine Learning ..... 220
10.3 Deep Learning ..... 220
10.4 Types of Problems and Tasks ..... 221
10.4.1 Supervised Learning ..... 221

10.4.2 Unsupervised Learning ..... 221
10.4.3 Reinforcement Learning ..... 221
10.5 List of Common Machine Learning Algorithms ..... 222
10.5.1 Linear Regression ..... 222
10.5.2 Logistic Regression ..... 223
10.5.3 Decision Tree ..... 223
10.5.4 Artificial Neural Networks (ANNs) ..... 224
10.5.4.1 Case Study - Prediction & Comparison of the Maximal Wall Shear Stress (MWSS) for Carotid Artery Bifurcation ..... 224
10.6 Machine Learning in Fluid Dynamics ..... 226
10.6.1 Motivation and Objectives ..... 226
10.6.2 Design and Optimization Issue ..... 226
10.6.3 Accomplishments ..... 227
10.6.4 Field Inversion and Machine Learning in Support of Data Driven Environment ..... 228
10.6.4.1 Artificial Neural Networks (ANNs) ..... 228
10.6.5 The POD as Linear Artificial Neural Network (LANN) ..... 229
10.6.5.1 POD and Nonlinear ANN ..... 231
10.6.6 Overview of ANNs in Turbulence Applications ..... 231
10.6.7 The Future of ANNs for Fluids Modelling ..... 232
10.6.8 Classification of Machine Learning (ML) Frameworks for Data-Driven Thermal Fluid Models as Envisioned by [Chang & Dinh] ..... 233
10.6.8.1 Machine Learning (ML) for Thermal Fluid Simulation ..... 233
10.6.8.2 Thermal Fluid Data ..... 235
10.6.8.3 Machine Learning Frameworks for Data-Driven Thermal Fluid Models ..... 235
10.6.8.4 Criteria for Classifying ML Frameworks for Thermal Fluid Simulation ..... 236
10.6.8.4.1 Criterion 1 - Is PDE Involved in Thermal Fluid Simulation? ..... 236
10.6.8.4.2 Criterion 2 - Is the Form of PDEs Given? ..... 236
10.6.8.4.3 Criterion 3 - Is the PDE Involved in the Training of Closure Relations? ..... 236
10.6.8.4.4 Criterion 4 - Is a Scale Separation Assumption Required for the Model Development? ..... 237
10.6.8.5 Type I - Physics-Separated Machine Learning (PSML) ..... 237
10.6.8.5.1 Element 1 ..... 237
10.6.8.5.2 Element 2 ..... 237
10.6.8.5.3 Element 3 ..... 237
10.6.8.5.4 Element 4 ..... 237
10.6.8.5.5 Element 5 ..... 237
10.6.8.5.6 Element 6 ..... 237
10.6.8.5.7 Element 7 ..... 237
10.6.8.6 Type II - Physics-Evaluated Machine Learning (PEML) ..... 239
10.6.8.7 Type III - Physics-Integrated Machine Learning (PIML) ..... 239
10.6.8.8 Type IV - Physics-Recovered Machine Learning (PRML) ..... 239
10.6.8.9 Type V - Physics-Discovered Machine Learning (PDML) ..... 240
10.6.8.10 Knowledge and Data Requirements for ML Frameworks in Thermal Fluid Simulation ..... 240
10.6.8.11 Case Study - Heat Conduction Investigation by Type I ML Framework ..... 241
10.6.8.11.1 Problem Formulation ..... 241
10.6.8.11.2 Manufacturing IET Data ..... 242
10.6.8.11.3 Manufacturing SET Data ..... 243

10.6.8.11.4 Implementation of the Heat Conduction by Type I ML Frameworks ..... 243
10.6.8.11.5 CNN-Based Thermal Conductivity Model ..... 244
10.6.8.11.6 Closing Remarks ..... 245

11 Appendix A ..... 246
11.1 Routine for Inverse Distance Weighted Interpolation (Shepard's Method) ..... 246

List of Tables

Table 2.1 Main Parameters of the Test Stages (P1) and (P2) ..... 28
Table 3.1 Discretization Error for 2D Burgers' Equation ..... 46
Table 3.2 NASA Code Comparisons for Surface Forces in M6 Wing ..... 56
Table 3.3 Statistics of Four Finest Grids for Hemisphere Cylinder Grid Families (Courtesy of [Diskin et al.]) ..... 57
Table 3.4 Hemisphere Cylinder: Variation of Aerodynamic Coefficients on L1 Grids (Courtesy of [Diskin et al.]) ..... 59
Table 3.5 Statistics of Grids for OM6 Wing Grid Families ..... 63
Table 3.6 Variations of Aerodynamic Coefficients (Courtesy of [Diskin et al.]) ..... 64
Table 6.1 Nomenclature used in this study ..... 133
Table 7.1 Memory Access Speed on a DEC Alpha ..... 142
Table 9.1 Three Orders of Magnitude Jump ..... 213
Table 9.2 Potential Exascale Computer Design for 2018 and its Relationship to Current HPC Designs (DOE) ..... 213
Table 10.1 Results of Different Methods ..... 225
Table 10.2 Criteria for the ML Framework Classification (Courtesy of Chang & Dinh) ..... 236
Table 10.3 Parameter Sets for the Thermal Conductivity Model (Courtesy of Chang & Dinh) ..... 242
Table 10.4 Summary of IET Training and Validating Data Sets (Courtesy of Chang & Dinh) ..... 242
Table 10.5 Summary of SET Training Datasets (Courtesy of Chang & Dinh) ..... 243

List of Figures

Figure 1.1 Active on-going Research Area in CFD ..... 14
Figure 1.2 Integrated Simulation for Nuclear Engineering ..... 16
Figure 2.1 Interpolation on a matrix manifold ..... 18
Figure 2.2 Comparison of the drag coefficient obtained with the High Fidelity (HF) and ROM simulations ..... 24
Figure 2.3 Comparison between velocity and pressure High Fidelity (HF)-ROM ..... 25
Figure 2.4 1D vs 3D Analysis ..... 27
Figure 2.5 Turbine Flow Design Process ..... 28
Figure 2.6 Profile of Blades ..... 29
Figure 2.7 Typical Cooling System Network for Airflow Rate ..... 30
Figure 3.1 Schematic Composition of Uncertainty Estimates in a Diffuser ..... 35
Figure 3.2 Variation in Cp at y/b = 0.4 and 0.65 along the ONERA M6 wing ..... 36
Figure 3.3 Comparison of (a) the Uncertainty Estimates, versus (b) Shock on the ..... 36
Figure 3.4 Variation in Mach number, temperature and pressure along the centerline of the jet efflux x/Djet ..... 37
Figure 3.5 Pressure Coefficient at 20% Chord Length using Different Turbulence Models ..... 38
Figure 3.6 Effects of Different Turbulence Models in a Steep Obstacle ..... 39
Figure 3.7 Inviscid stencil with 1st order cells in red and 2nd order cells in green ..... 40
Figure 3.8 Viscous stencil with viscous cells in blue and 2nd order cells in green ..... 40

Figure 3.9 Effect of Pe Number in balancing Diffusive and Convective Flows ..... 41
Figure 3.10 Effect of 1st and 2nd Order Differencing Scheme in Error ..... 41
Figure 3.11 Effects of mesh density on solution domain ..... 42
Figure 3.12 Domain Topology (O-Type, C-Type, and H-Type; from left to right) ..... 43
Figure 3.13 Contours of Total Estimated Discretization Error in Density ..... 45
Figure 3.14 Exact error, Estimated error scheme for viscous Burgers' equation (Courtesy of Yan and Ollivier-Gooch) ..... 47
Figure 3.15 Temporal Discretization Criteria ..... 47
Figure 3.16 Relative discretization error for the transitional flow over a sharp cone ..... 48
Figure 3.17 Effect of CFL Number on Convergence of 1D Wave Equation ..... 49
Figure 3.18 Estimated Iteration Error of U1 for Different Level of Tolerance Criteria et ..... 52
Figure 3.19 Global View of and Boundary Conditions (Courtesy of [Diskin et al.]) ..... 57
Figure 3.20 Global View of Hemisphere Cylinder Pressure Contours using L1 grid at surfaces y = 0 (left) and x = 6 (right) (Courtesy of [Diskin et al.]) ..... 58
Figure 3.21 Grid Convergence of Aerodynamic Forces for Hemisphere Cylinder (Courtesy of [Diskin et al.]) ..... 60
Figure 3.22 Global View of Surface Pressure and Skin Friction at symmetry plane (y = 0) for Hemisphere Cylinder (Courtesy of [Diskin et al.]) ..... 61
Figure 3.23 M6 wing: pressure contours computed by USM3D on family 4 prism/hex L1 grid (Courtesy of [Diskin et al.]) ..... 63
Figure 3.24 M6 Grid Convergence of Aerodynamic Forces CL, CD ..... 64
Figure 3.25 M6 Grid Convergence of Pitching Moment ..... 65
Figure 3.26 M6 section 1 (η = x/c = 0.2) View of leeside Pressure Grid Refinement (Courtesy of [Diskin et al.]) ..... 66
Figure 4.1 Example of CFD simulations in cardiovascular and respiratory systems ..... 70
Figure 4.2 CFD Model Construction for Biomedical Application ..... 75
Figure 4.3 Schematic of a Heart. Image by Wapcaplet (Licensed via Wikimedia Commons) ..... 79
Figure 4.4 FSI Model of a Heart Valve Opening (left) and Closing (right) ..... 80
Figure 4.5 Axial velocity and Time Average ..... 82
Figure 4.6 Anatomic Model for the Patient with Carotid Artery Plaque ..... 83
Figure 4.7 3D CT visualization of a normal left coronary artery with coronary artery disease ..... 84
Figure 4.8 Plaque distribution in left coronary artery Model ..... 85
Figure 4.9 The EPL Posterior View at left Coronary Artery ..... 87
Figure 4.10 Flow velocity observed in pre and post plaque simulated models ..... 88
Figure 4.11 Cross-sectional views of A–E at the left main stem ..... 89
Figure 4.12 Comparison of WSS between non-Newtonian and Newtonian Models Observed in Coronary Artery with Presence of Plaques ..... 90
Figure 5.1 Domain representation ..... 95
Figure 5.2 Different types of Support domains ..... 96
Figure 5.3 1-D SPH Characterization ..... 96
Figure 5.4 The choice of Different Smooth Kernel in 1D (h=1) ..... 98
Figure 5.5 Ghost Particles, Velocities are formed Symmetrically (slip wall) ..... 99
Figure 5.6 Virtual Particles ..... 99
Figure 5.7 Example of a 1D task, particle j is Situated in the Near Boundary Area ..... 99
Figure 5.8 Comparison of FDM with SPH for Lid Driven Cavity ..... 100
Figure 5.9 The diagram of global domain Ω, local support domain Ωs of point xs, global points x and local point xi ..... 101
Figure 5.10 Lagrange particle-based fluid structure in 2D ..... 102
Figure 5.11 The default kernel and its derivatives in one dimension for h=1 ..... 103
Figure 5.12 The leap-frog mechanism ..... 104

12

Figure 5.13 Comparison of ISPH (upper), FEM (middle) and WCSPH (lower) velocity contours for the angle of attack of 15 degrees at Re = 570 (Courtesy of Shadloo105) ........................................ 106 Figure 5.14 Dam-Break Flow of water ............................................................................................................ 107 Figure 5.15 Geometry and Water surface profile of the 2D dam-break problem at t =7.2 s..... 107 Figure 5.16 Snapshots of the Evaporating Drop at different times using SPH ............................... 109 Figure 5.17 Evolution of Dynamic Drop impact on a hot surface using SPH ................................... 110 Figure 6.1 Illustrates the various classes of conductors .......................................................................... 116 Figure 6.2 Modern Semiconductor ................................................................................................................... 117 Figure 6.3 Thermal Management of Semiconductor (courtesy of Mentor CFD) ............................ 117 Figure 6.4 An Example of an Egg Frying on a CPU ...................................................................................... 118 Figure 6.5 Right Hand Rule for MHD ................................................................................................................ 120 Figure 6.6 Schematic diagram of numerical domain ................................................................................. 121 Figure 6.7 Contour plots of vorticity snapshot at Red = 160 and at Hartmann number as indicated ........................................................................................................................................................................... 122 Figure 6.8 Illustration of a Standard Cartesian Yee cell used for FDTD for Electric and Magnetic Field.................................................................................................................................................................................... 125 Figure 6.9 Nanofluid preparation is not just mixing nanoparticles and a liquid but special physical and chemical techniques are needed to have a stable nanofluid ........................................... 126 Figure 6.10 Some applications of nanofluids in a glance ......................................................................... 127 Figure 6.11 Different CFD Techniques for Nano-Fluids ........................................................................... 128 Figure 6.12 (a) Flow in a wavy Chuime solved by ANSYS FLUENT[17] (b) flow in a parabolic trough collector solved by ANSYS CFX[22] (figures are reprinted with permission from publisher) ........................................................................................................................................................................ 129 Figure 6.13 (a) Flow in a shell and tube regenerative type latent heat storage system solved by COMSOL Multiphysics (b) Analysis of a Direct Solar Absorption Collector by FlexPDE[27] (Figure reprinted with permission from publisher) ...................................................................................................... 130 Figure 6.14 Forces acting on a general particle suspended in a fluid flow by different sources .............................................................................................................................................................................................. 
131 Figure 7.1 Contributions from other disciplines to CFD .......................................................................... 139 Figure 7.2 Cache Lines can come from Different Parts of Memory ..................................................... 143 Figure 7.3 Many memory addresses map to the same cache line ........................................................ 145 Figure 7.4 Two-Way Set-Associative Cache .................................................................................................. 147 Figure 7.5 Sharp Profiling (right) vs. Flat Profiling (right) ..................................................................... 151 Figure 7.6 (a) Control Dependency; (b) A section of your program; (c) Expensive Operation Moved so that it's Rarely Executed ....................................................................................................................... 162 Figure 7.7 Types of Data Dependencies .......................................................................................................... 163 Figure 7.8 Flow Graph for Data Flow Analysis ............................................................................................. 164 Figure 7.9 Flow Graph including a loop ......................................................................................................... 165 Figure 7.10 Difference Processing Between Classical and Quantum Computer ........................... 170 Figure 7.11 The Bloch Sphere is a Representation of a Qubit, the Fundamental Building Block of Quantum Computers .............................................................................................................................................. 171 Figure 8.1 Multi-Processor vs. Multi-Computer .......................................................................................... 175 Figure 8.2 Shared Memory Multi-Processor ................................................................................................. 176 Figure 8.3 Distributed Memory Multi-Processor ........................................................................................ 177 Figure 8.4 Amdahl's Law ....................................................................................................................................... 178 Figure 8.5 Example of Strong Scalability ........................................................................................................ 179 Figure 8.6 Architecture differences between CPU and GPU ................................................................... 181 Figure 8.7 Results for V-Cycle Multigrid ......................................................................................................... 182 Figure 8.8 Heterogeneous Computing using CPUs and GPUs ................................................................ 182

13

Figure 8.9 Pressures at the Surface and Plane for the NACA 00012 (Left) and at the Surface for the Missile (Right) ........................................................................................................................................................ 184 Figure 8.10 Running Times in double Precision Per Element Per Iteration for the NACA0012 (top) and Missile (bottom) ....................................................................................................................................... 185 Figure 8.11 Maui High Performance Computing Center .......................................................................... 188 Figure 8.12 Performance rate of two HPC for benchmark CFD Analysis .......................................... 189 Figure 8.13 Scope of HPC and HSC .................................................................................................................... 190 Figure 9.1 Changing Predictions About Semiconductor Sizes ............................................................... 192 Figure 9.2 Proposed New Computational Sciences Program Structure ............................................ 204 Figure 9.3 Computer speed and memory requirements for the Grand Challenge ........................ 206 Figure 9.4 A supersonic Jet Engine Nozzle Rapidly Accelerates High-Pressure Gas into the Atmosphere..................................................................................................................................................................... 207 Figure 9.5 Detail View of 9-Billion Atom Molecular Dynamics Simulation Instability ................ 211 Figure 10.1 Scope of Artificial Intelligence (Courtesy of Hackerearth Blog)................................... 219 Figure 10.2 Schematics of AI, Machine Learning and Deep Learning................................................ 220 Figure 10.3 Linear Regression ............................................................................................................................ 222 Figure 10.4 Decision Tree ..................................................................................................................................... 223 Figure 10.5 Artificial Neural Network (ANN) ............................................................................................... 224 Figure 10.6 Maximal Wall Shear Stress (MWSS) Value for Carotid Artery Bifurcation ............... 225 Figure 10.7 Calibration Cases for Off Line Data ........................................................................................... 228 Figure 10.8 Network Diagram for a feed-forward NN with three inputs and one output ........ 229 Figure 10.9 Comparison of linear POD (top) and Neural Networks (bottom) ............................... 231 Figure 10.10 Skin Friction Coefficient for Onera M6 match to within 2% ...................................... 232 Figure 10.11 Workflow of Employing ML methods for Developing Thermal fluid closures – (Courtesy of Chang & Dinh)...................................................................................................................................... 234 Figure 10.12 Hierarchy of Thermal Fluid Data - (Courtesy of Chang & Dinh) ................................ 235 Figure 10.13 Overview of Type I ML Framework with a Scale Separation Assumption (Courtesy of Chang & Dinh)...................................................................................................................................... 
238 Figure 10.14 Domain of Various ML Frameworks where L, M, and H Denote Low, Medium, and High - (Courtesy of Chang & Dinh) ........................................................................................................................ 240 Figure 10.15 Schematic of integral effects tests (IETs) for measuring Temperature fields (Courtesy of Chang & Dinh)...................................................................................................................................... 242 Figure 10.16 Schematic of Separate Effects Tests (SETs) for Measuring Thermal Conductivity as the Function of Sample’s Mean Temperature - (Courtesy of Chang & Dinh) ................................. 243 Figure 10.17 Architecture of CNN-based thermal conductivity (adopted after LeCun) ............. 244


1 Introduction

As evident in Figure 1.1 below, there is no shortage of active research areas in CFD. Besides the regular on-going research into new algorithms, the range of activities keeps expanding; some, but not all, are mentioned here. Some of the more prominent research areas are shown in Figure 1.1, and a few of them are outlined below.

Figure 1.1   Active on-going Research Area in CFD

1.1 Computational Predictability plus Verification & Validation

This includes relatively well-defined tasks, such as verification of the correctness of computer codes and uncertainty quantification, as well as hazier ones, such as validation of the model being used. As codes become more complex, their verification becomes more challenging; methods such as the Method of Manufactured Solutions offer one approach. In its simplest form, uncertainty quantification is simply the propagation of uncertainties in parameters, properties and models to the final solution. Although conceptually simple, this is a formidable task, both because we need to know all elementary uncertainties and because of the number of computations involved. Another avenue is sensitivity analysis of the solution with respect to the design variables of interest, which can be obtained from the first-order derivatives that indicate where a function attains its extrema. Statistical analysis has certainly proved its value in many areas, such as quality control in manufacturing, and uncertainty quantification is likely to become increasingly important in the use of simulations in design.
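To make the propagation idea above concrete, the following is a minimal Monte Carlo sketch. The "model" is a cheap stand-in for a CFD solver or surrogate, and the input distributions, parameter names and sample sizes are illustrative assumptions only, not taken from any particular code.

```python
import numpy as np

rng = np.random.default_rng(seed=1)

def model(reynolds, roughness):
    # Placeholder surrogate for an expensive CFD run: returns a drag-like quantity.
    return 0.074 / reynolds**0.2 * (1.0 + 50.0 * roughness)

# Elementary uncertainties in the inputs (assumed distributions).
re_samples = rng.normal(1.0e6, 5.0e4, size=10_000)
rough_samples = rng.uniform(1.0e-4, 3.0e-4, size=10_000)

# Propagate the samples through the model and summarize the output uncertainty.
qoi = model(re_samples, rough_samples)
print(f"mean = {qoi.mean():.3e}, std = {qoi.std():.3e}")
print("95% interval:", np.percentile(qoi, [2.5, 97.5]))
```

In practice the expensive solver is usually replaced by a surrogate or reduced order model before sampling at this rate.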

1.2 Multiscale/Multiphysics

Multiscale is a broad term that usually means what the user intends it to. In most cases, however, it is used for phenomena in which some aspects of the physics that we wish to compute must be described by a different physical model. This can include contact lines in multiphase flow simulations represented by molecular or phase-field models, reaction zones, shocks in rarefied gases, and so on. While we often think of multiscale as representing different physical processes, such as continuum and non-continuum descriptions, it also applies to the same physics modeled in different ways, such as when small drops are modeled as point particles. Numerical challenges include how to blend one description with another.

1.3 Mesh Free Methods for CFD

While the generation of meshes has always posed challenges for computational scientists, the problem has become more acute in recent years. While algorithms have seen great advances, mesh generation has lagged behind, creating a computational bottleneck. For industry and government looking to impact current and future products with simulation technology, mesh generation imposes great challenges. Many generation procedures lack automation, requiring many man-hours, which are becoming far more expensive than computer hardware. More automated methods are less reliable for complex geometry with sharp corners, concavity, or other complex features. Most mesh generation methods to date require a great deal of user expertise to obtain accurate simulation results. Since the application of computational methods to real world problems appears to be paced by mesh generation, alleviating this bottleneck potentially impacts an enormous field of problems1.
Meshless methods applied to computational fluid dynamics are a relatively new area of research designed to help alleviate the burden of mesh generation. Despite their recent beginning, there exists no shortage of formulations and algorithms for meshless schemes in the literature. A brief survey of the field reveals varied approaches arising from diverse mathematical backgrounds applied to a wide variety of applications. All meshless schemes attempt to bypass the use of a conventional mesh, entirely or in part, by discretizing the governing partial differential equations on scattered clouds of points or on collections of smooth blobs of particles. There are two different approaches which are called meshless. One contains methods such as surface panel methods and boundary element methods, which do not contain a volume grid. The other type uses an arbitrary distribution of points in the computational domain; particle methods also belong to this category, where the particles themselves act as discretization points. The method is called meshless because the points need not form any grid and do not have to be arranged in any particular manner. The main motivation of meshless methods is that it is much easier to generate a cloud of points than a mesh. The accuracy of grid-based methods depends on the quality of the grid, so one has to ensure orthogonality or make sure that elements are not highly skewed, while meshless methods are not much affected by how the points are distributed2. Sorting and classifying the many meshless methods is no simple task. To add to the confusion, meshless schemes fall under many other names, including mesh free, grid free, grid less, generalized finite difference, and Smooth Particle Hydrodynamics (SPH). We adopt the mesh-free vocabulary here. Among these methods, Smooth Particle Hydrodynamics (SPH) is distinctive: the fluid mass is lumped into smoothed blobs that are moved using Newton's second law directly, without an underlying mesh. In SPH the fluid is thus modeled as a collection of smooth "blobs" or particles3.

1 Aaron Jon Katz, ResearchGate, "Meshless methods for computational fluid dynamics", January 2009.

1.4 Integrated Simulations of Complex Systems

Engineers have long desired to have computational models that describe systems consisting of many coupled components. At the simplest level such simulators model the dynamics of connected rigid bodies, lumped models of chemical and power plants, and so on. As computers become more powerful, we are seeing growing efforts to attempt much more complex modeling, such as of rockets (the Illinois ASCI center) or a nuclear power plant (CASL), and other DOE research-hub funded efforts. Other examples include the Human Body Simulator Project in Japan (led by S. Takagi) and possibly the recently announced Living Earth Simulator proposal by D. Helbing. Overall, there is very limited theoretical basis for how to do the coupling (with some exceptions, such as for solid/fluid problems), and there is considerable room for significant progress. As the ASCI programs, CASL and other efforts suggest, this is going to be a very significant area in the future (see Figure 1.2).

Figure 1.2   Integrated Simulation for Nuclear Engineering

2 From CFD Online Forum.
3 Grétar Tryggvason, "Smooth Particle Hydrodynamics", Lecture Series 2013.


2 Reduced Order Modeling (ROM)

Many modern mathematical models of real-life processes pose challenges when used in numerical simulations, due to their complexity and large size (dimension). Model order reduction aims to lower the computational complexity of such problems, for example in simulations of large-scale dynamical systems and control systems. By reducing the model's associated state-space dimension or degrees of freedom, an approximation to the original model is computed. This Reduced Order Model (ROM) can then be evaluated with lower accuracy but in significantly less time. Reduced order models can be thought of as computationally inexpensive mathematical representations that offer the potential for near real-time analysis. While most ROMs can operate in near real-time, their construction can be computationally expensive, as it requires accumulating a large number of system responses to input excitations. Furthermore, ROMs usually lack robustness with respect to parameter changes and therefore must often be rebuilt for each parameter variation. Together, these two issues underline the need for a fast and robust method for adapting pre-computed ROMs to new sets of physical or modeling parameters. However, ROMs and their corresponding Reduced Order Bases (ROB) are quantities that typically belong to nonlinear matrix manifolds. As such, classical interpolation methods fail, since they are not able to enforce the constraints characterizing those manifolds. The first part of the project consists of designing a suitable interpolation method enforcing those constraints. A schematic representation of the algorithm is shown in Figure 2.1. It relies on identifying the correct manifold for the given application, constructing the appropriate logarithm mapping to move the interpolation data to a tangent space of this manifold, where a standard multivariate interpolation algorithm can be applied, and constructing the appropriate exponential mapping to bring the computed result back to the manifold of interest4 (a sketch of this log, interpolate, and exp workflow is given after the list below).

Figure 2.1   Interpolation on a matrix manifold

The purposes of reduced order models (ROMs) are:

• taking advantage of redundancies
• identifying ‘genuine’ degrees of freedom
• giving low dimensional approximations (few modes)
• preserving a satisfactory accuracy
• decreasing the computational resources (time & storage)
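As a sketch of the log, interpolate, and exp workflow described above, the snippet below treats the common case where only the subspace spanned by the ROB matters, i.e., the manifold is assumed to be a Grassmann manifold. The formulas follow the standard tangent-space construction; the function names and the one-parameter, piecewise-linear interpolation are illustrative choices, not those of any particular library or code.

```python
import numpy as np

def grassmann_log(V0, Vi):
    """Tangent vector at the reference basis V0 pointing toward the subspace of Vi."""
    M = (Vi - V0 @ (V0.T @ Vi)) @ np.linalg.inv(V0.T @ Vi)
    U, s, Wt = np.linalg.svd(M, full_matrices=False)
    return U @ np.diag(np.arctan(s)) @ Wt

def grassmann_exp(V0, Gamma):
    """Map a tangent vector back to an orthonormal basis on the manifold."""
    U, s, Wt = np.linalg.svd(Gamma, full_matrices=False)
    return V0 @ Wt.T @ np.diag(np.cos(s)) @ Wt + U @ np.diag(np.sin(s)) @ Wt

def interpolate_rob(params, bases, p_new, ref=0):
    """Interpolate precomputed ROBs (orthonormal n x p matrices) to a new parameter p_new."""
    V0 = bases[ref]
    gammas = np.array([grassmann_log(V0, V) for V in bases])
    # Standard interpolation entry-by-entry in the tangent space (here: piecewise linear in 1 parameter).
    Gamma_new = np.zeros_like(gammas[0])
    for idx in np.ndindex(Gamma_new.shape):
        Gamma_new[idx] = np.interp(p_new, params, gammas[(slice(None),) + idx])
    return grassmann_exp(V0, Gamma_new)
```

Interpolating in the tangent space and mapping back guarantees that the result is again an orthonormal basis, which a naive entry-wise interpolation of the bases themselves would not provide.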

Reduced Order Models based on statically non-linear flow solutions, but with a dynamically time-linear approach, have been developed. Thus, unsteady flows that are a small perturbation about a steady flow with shocks and separations can be modelled. This makes ROMs ideal for applications such as flutter clearance and aero-servo-elasticity. To generate a ROM about a particular non-linear mean solution, the dynamically time-linear response must be extracted from the CFD code.

4 Farhat Research Group.


2.1 Various Techniques

There is a large variety of ROMs in the market. They are also known as surrogate models. A common approach for model order reduction is Projection-Based reduction. The following methods fall into this class:

• The classic Proper Orthogonal Decomposition (POD) with Galerkin projection.
• Either singular value decomposition (SVD) or high-order singular value decomposition (HOSVD), possibly combined with interpolation.
• Reduced Basis Method.
• Balanced Truncation.
• Approximate Balancing.
• Matrix Interpolation.
• Transfer Function Interpolation.
• Piecewise Tangential Interpolation.
• Loewner Framework.
• Empirical (Cross Gramian)5.
• Krylov Subspace Methods.

Among these, POD-Galerkin reduced order modelling applied to the Finite Volume discretization technique has gained the widest acceptance in industrial applications.

2.2 Common Features Shared by Reduced Order Methods (ROM)

All reduced bases require the solution of high-fidelity, and therefore very expensive, discrete state and/or sensitivity equations and/or adjoint equations. The idea is that these expensive calculations can be done off-line, before a state simulation or the optimization of the design parameters or feedback control is attempted. Moreover, one hopes that a single reduced basis can be used for several state simulations or in several design or control settings6. All reduced-basis sets are global in nature, i.e., the basis functions have global support. Therefore, solving the state or sensitivity or adjoint equations with respect to any of the reduced bases requires the solution of dense linear and nonlinear systems. Thus, unless the dimension of a reduced basis is "small," it cannot be used without some further processing. Unfortunately, in order to obtain meaningful approximations, it is often the case that the use of reduced bases requires a relatively large number of basis functions. However, reduced bases often contain "redundant" information, in the sense that the dynamics of the state should be well approximated by a set of functions of much lower dimension. The question then arises: how can one extract a reduced basis of smaller dimension that contains all the essential information of a reduced basis of larger dimension? This is where Proper Orthogonal Decomposition (POD) and Centroidal Voronoi Tessellations (CVT) come in and, in this sense, they are reduced-reduced basis methods. Unfortunately, there is no adequate theoretical foundation for reduced-order methods, even in state simulation settings. However, it is certain that without an inexpensive method for reducing the cost of state computations, it is unlikely that the solution of 3D optimization and control problems involving complex systems, e.g., the Navier-Stokes system, will become routine anytime soon. Thus, it is also certainly true that these methods deserve more study from the computational and theoretical points of view.

5 In control theory, the cross Gramian is a Gramian matrix used to determine how controllable and observable a linear system is.
6 John Burkardt, Qiang Du, Max Gunzburger & Hyung-Chun Lee, "Reduced order modeling of complex systems", NA03 Dundee 2003.

2.3 Reduced Basis Methods

All reduced-order methods are reduced basis methods. However, there is a class of methods that use Lagrange bases, Hermite bases, Taylor bases, and Snapshot bases (or, more precisely, snapshot sets) that have come to be known as Reduced-Basis Methods.

2.3.1 Lagrange
Lagrange bases consist of state solutions corresponding to several different values of the parameters (Reynolds number, design parameters, etc.). These solutions are obtained by standard (and expensive) techniques such as finite element or finite volume methods. For example, if one has the design parameters {αj}, j = 1, …, J, one obtains approximate state solutions for n sets of parameter values to form the n-dimensional Lagrange reduced basis7.

2.3.2 Hermite
Hermite bases consist of the state variables and the first derivatives of the state variables with respect to parameters (the sensitivities), determined for different values of the parameters. The state and sensitivity approximations are obtained through standard (and expensive) techniques such as finite element or finite volume methods. Thus, again, if one has the design parameters {αj}, j = 1, …, J, one chooses M sets of parameter values and then obtains the corresponding M approximate state solutions and the corresponding MJ sensitivity derivative approximations. The n = M(J + 1) state and sensitivity approximations form the Hermite reduced basis of dimension n.

2.3.3 Taylor
Taylor bases consist of the state and the derivatives of the state with respect to parameters (sensitivities and higher-order sensitivities), determined for a fixed set of design parameters. The state and derivative approximations are obtained through standard (and expensive) techniques such as finite element or finite volume methods. The Taylor basis may be somewhat complicated to program due to the complexity of the partial differential equations that determine the higher-order sensitivities. In addition, the number of higher-order derivatives grows very rapidly with the number of design parameters; e.g., if one has 10 design parameters, there are 55 different second-derivative sensitivities. Thus, the dimension of the Taylor reduced basis grows quickly with the number of parameters and the number of derivatives used.

2.3.4 Snapshot Sets
The state of a complex system is determined by parameters that appear in the specification of a mathematical model for the system. Of course, the state of a complex system also depends on the independent variables appearing in the model. Snapshot sets consist of state solutions corresponding to several parameter values and/or evaluated at several values of one or more of the independent variables, for example, steady-state solutions corresponding to several sets of design parameters, a time-dependent state solution for a fixed set of design parameter values evaluated at several time instants during the evolution process, or several state solutions corresponding to different sets of parameter values evaluated at several time instants during the evolution process. Snapshot sets are often determined by solving the full, very large-dimensional discretized system obtained via finite volume or finite element discretization. Experimental data have also been used to determine a snapshot set. Snapshot sets often contain "redundant" information; therefore, snapshot sets must usually be post-processed to remove as much of the redundancy as possible before they can be used for reduced-order modeling. POD and CVT may be viewed as simply different ways to post-process snapshot sets. Since snapshot sets are the underpinning for POD and CVT, we briefly discuss how they are generated in practice. At this time, the generation of snapshot sets is an art and not a science; in fact, it is a rather primitive art. The generation of snapshot sets is an exercise in the design of experiments, e.g., for stationary systems, how does one choose the sets of parameters at which the state (and sensitivities) are to be calculated (using expensive, high-fidelity computations) in order to generate the snapshot set? Clearly, some a priori knowledge about the types of states to be simulated or optimized using the reduced-order model is very useful in this regard. The large body of statistics literature on the design of experiments has not been used in a systematic manner. For time-dependent systems, many (ad hoc) measures have been invoked in the hope that they will lead to good snapshot sets. Time-dependent parameters (e.g., in boundary conditions) are used to generate states that are "rich" in transients, even if the state of interest depends only on time-independent parameters. In order to generate even "richer" dynamics, impulsive forcing is commonly used, e.g., starting the evolution impulsively with different strength impulses and introducing impulses in the middle of a simulation. In the future, a great deal of effort needs to be directed towards developing and justifying methodologies for generating good snapshot sets8. After all, a POD or CVT basis is only as good as the snapshot set used to generate it. A minimal parameter-sampling sketch is given below.

7 John Burkardt, Qiang Du, Max Gunzburger & Hyung-Chun Lee, "Reduced order modeling of complex systems", NA03 Dundee 2003.
8 John Burkardt, Qiang Du, Max Gunzburger & Hyung-Chun Lee, "Reduced order modeling of complex systems", NA03 Dundee 2003.
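Since the snapshot-generation step above is essentially a design-of-experiments problem, space-filling sampling of the parameter domain is one common, simple choice. The sketch below is illustrative only; the two parameters, their ranges and the run budget are assumptions, not values taken from the text.

```python
import numpy as np
from scipy.stats import qmc

# Two illustrative design parameters: Reynolds number and angle of attack.
lower = [1.0e5, 0.0]     # [Re, alpha in degrees]
upper = [1.0e6, 10.0]

sampler = qmc.LatinHypercube(d=2, seed=0)
unit_samples = sampler.random(n=20)              # budget of 20 high-fidelity runs
params = qmc.scale(unit_samples, lower, upper)   # scale to physical ranges

for re, alpha in params:
    # Each row defines one expensive full-order run whose solution becomes a snapshot.
    print(f"run: Re = {re:.3e}, alpha = {alpha:.2f} deg")
```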

2.4 Proper Orthogonal Decomposition (POD) Spaces

In order to create a reduced basis space onto which the governing equations are projected, one can find many techniques in the literature, such as the Proper Orthogonal Decomposition (POD), the Proper Generalized Decomposition (PGD), and the Reduced Basis (RB) method with a greedy approach. The POD approach has been selected here. The POD consists of the decomposition of the flow fields into temporal coefficients a_i(t) and orthonormal spatial bases φ_i(x):

$u(\mathbf{x},t) = \sum_{i=1}^{N_s} a_i(t)\,\varphi_i(\mathbf{x})$        Eq. 2.1

where the φ_i(x) are orthonormal spatial bases that minimize the average error between the snapshots and their orthogonal projection onto the bases, and N_s is the number of considered snapshots. The POD space V_POD = span(φ_1, φ_2, …, φ_Ns) is then constructed by solving the following minimization problem:

$V_{POD} = \arg\min \frac{1}{N_s}\sum_{n=1}^{N_s} \Big\| u_n(\mathbf{x}) - \sum_{i=1}^{N_s} \big(u_n(\mathbf{x}),\varphi_i(\mathbf{x})\big)_{L^2(\Omega)}\,\varphi_i(\mathbf{x}) \Big\|^2_{L^2(\Omega)} \quad \text{with} \quad \big(\varphi_i(\mathbf{x}),\varphi_j(\mathbf{x})\big)_{L^2(\Omega)} = \delta_{ij}$        Eq. 2.2

where u_n is a general snapshot of the velocity field at time t = t_n. The snapshots can be numerical solutions of the NSEs (typically from LES and DNS simulations, or even from the RANS equations), or they can be obtained from experimental results. The POD basis minimizes the difference between the snapshots and the projection of the snapshots on the spatial modes in the X-norm, given the orthonormality of the modes. If the L2-norm is chosen, the POD basis is optimal with respect to the energy contained in the snapshots. Following the development in9, it can be shown that this problem can be solved by computing a singular value decomposition of the so-called snapshot matrix.
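A minimal sketch of the SVD route just mentioned: the columns of the snapshot matrix are the stored fields, the left singular vectors are the POD modes, and the singular values rank their energy content. For brevity the plain Euclidean inner product is used; in a Finite Volume setting the cell volumes would enter as weights, and the snapshot data below are random placeholders standing in for stored velocity fields.

```python
import numpy as np

# Snapshot matrix: each column is one flow field sampled on the mesh (random stand-in here).
n_dof, n_snap = 5000, 120
rng = np.random.default_rng(0)
S = rng.standard_normal((n_dof, n_snap))

# Thin SVD: left singular vectors are the POD modes, singular values rank their energy.
Phi, sigma, _ = np.linalg.svd(S, full_matrices=False)

# Retain the smallest number of modes capturing, say, 99.9% of the snapshot energy.
energy = np.cumsum(sigma**2) / np.sum(sigma**2)
N_r = int(np.searchsorted(energy, 0.999)) + 1
Phi_r = Phi[:, :N_r]                      # reduced basis (POD modes)
a = Phi_r.T @ S                           # temporal coefficients a_i(t_n)
print(N_r, "modes retained; relative reconstruction error:",
      np.linalg.norm(S - Phi_r @ a) / np.linalg.norm(S))
```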

2.4.1 Galerkin Projection into POD Space
In this section the Galerkin projection of the governing equations onto the POD space is highlighted and discussed. The idea here is to consider both the momentum conservation and the continuity equation. In order to be consistent with the full order solver, the same set of equations is considered, namely the momentum conservation and the Poisson equation for pressure.

2.4.2 Case Study - Vortex Shedding Around a Circular Cylinder using a POD-Galerkin Method
Vortex shedding around circular cylinders is a well-known and studied phenomenon that appears in many engineering fields. In this work a Reduced Order Model (ROM) of the incompressible flow around a circular cylinder, built by performing a Galerkin projection of the governing equations onto a lower dimensional space, is presented. The reduced basis space is generated using a Proper Orthogonal Decomposition (POD) approach. In particular the focus is on:

➢ The correct reproduction of the pressure field, which in the case of the vortex shedding phenomenon is of primary importance for the calculation of the drag and lift coefficients;
➢ For this purpose the projection of the governing equations (momentum equation and Poisson equation for pressure) is performed onto different reduced basis spaces for velocity and pressure, respectively;
➢ All the relevant modifications necessary to adapt standard finite element POD-Galerkin methods to a finite volume framework are presented.

The accuracy of the reduced order model is assessed against full order results.

2.4.2.1 Governing Equations
For the moment, we consider the incompressible Navier–Stokes equations without any turbulence treatment:

$\nabla\cdot\mathbf{u} = 0\ , \qquad \frac{\partial \mathbf{u}}{\partial t} + (\mathbf{u}\cdot\nabla)\,\mathbf{u} - \nu\Delta\mathbf{u} + \nabla p = 0$        Eq. 2.3

where u is the velocity, p is a normalized pressure and ν is the kinematic viscosity. The equations are given in a domain Ω with proper boundary and initial conditions. The Finite Volume method is a discretization method based on a "balance" approach, well suited for the solution of equations based on conservation laws. A local balance, obtained from the discretization of the integral form of the governing equations, is written on each discretization cell. For details, readers should consult10,11. This approach can be interpreted as if the state vector of the variables of interest were expanded as a linear combination of state-vector spatial modes:

$\begin{pmatrix} \mathbf{u}(\mathbf{x},t)\\ F(\mathbf{x},t)\\ p(\mathbf{x},t) \end{pmatrix} \approx \begin{pmatrix} \mathbf{u}_r(\mathbf{x},t)\\ F_r(\mathbf{x},t)\\ p_r(\mathbf{x},t) \end{pmatrix} = \sum_{i=1}^{N_r} a_i(t) \begin{pmatrix} \boldsymbol{\varphi}_i(\mathbf{x})\\ \psi_i(\mathbf{x})\\ \chi_i(\mathbf{x}) \end{pmatrix}$        Eq. 2.4

9 Giovanni Stabile, Saddam Hijazi, Andrea Mola, Stefano Lorenzi, Gianluigi Rozza, "Advances in Reduced order modelling for CFD: vortex shedding around a circular cylinder using a POD-Galerkin method", Communications in Applied and Industrial Mathematics, ISSN 2038-0909, 2017.
10 Stefano Lorenzi, Antonio Cammi, Lelio Luzzi, Gianluigi Rozza, "POD-Galerkin method for finite volume approximation of Navier–Stokes and RANS equations", Comput. Methods Appl. Mech. Engrg. 311 (2016) 151–179.
11 Giovanni Stabile, Saddam Hijazi, Andrea Mola, Stefano Lorenzi, Gianluigi Rozza, "Advances in Reduced order modelling for CFD: vortex shedding around a circular cylinder using a POD-Galerkin method", Communications in Applied and Industrial Mathematics 9 (1), 2017, 1–21.

The reduced order model of the momentum equation is obtained by replacing the velocity u with u_r and p with p_r in the governing equations (Eq. 2.3), employing the approximated face flux F_r in the convective term, and applying a Galerkin projection, i.e., an L2 orthogonal projection onto the reduced basis space V_POD spanned by the POD velocity modes, with a procedure similar to that presented in Eq. 2.2:

$\Big(\boldsymbol{\varphi}_i\,,\; \frac{\partial\mathbf{u}}{\partial t} + (\mathbf{u}\cdot\nabla)\,\mathbf{u} - \nu\Delta\mathbf{u} + \nabla p \Big)_{L^2(\Omega)} = 0$        Eq. 2.5

With respect to Eq. 2.2, here the gradient of pressure is also retained inside the momentum equation. It is assumed that velocity and pressure modes share the same temporal coefficients. Substituting the POD approximations of u, F and p into Eq. 2.5 and exploiting the orthogonality of the POD modes, one obtains the following dynamical system of Ordinary Differential Equations (ODEs), the POD-Galerkin ROM for Finite Volume discretization (POD-FV-ROM):

$\frac{d a_j(t)}{dt} = \nu \sum_{i=1}^{N_r} B_{ji}\,a_i(t) \;-\; \sum_{k=1}^{N_r}\sum_{i=1}^{N_r} C_{jki}\,a_k(t)\,a_i(t) \;-\; \sum_{i=1}^{N_r} A_{ji}\,a_i(t)$

$B_{ji} = \big(\boldsymbol{\varphi}_j\,,\,\Delta\boldsymbol{\varphi}_i\big)_{L^2}\ ,\qquad C_{jki} = \big(\boldsymbol{\varphi}_j\,,\,\nabla\cdot(\psi_k\,,\boldsymbol{\varphi}_i)\big)_{L^2}\ ,\qquad A_{ji} = \big(\boldsymbol{\varphi}_j\,,\,\nabla\chi_i\big)_{L^2}$        Eq. 2.6
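Once the reduced matrices/tensors B, C and A of Eq. 2.6 have been assembled by projecting the discretized operators onto the modes, the ROM is just a small system of ODEs for the coefficients a_j(t). The sketch below integrates such a system; the tensors and the initial condition are random placeholders standing in for the projected operators (the case study itself used the Matlab ODE suite).

```python
import numpy as np
from scipy.integrate import solve_ivp

N_r, nu = 10, 1.0e-3
rng = np.random.default_rng(1)
B = rng.standard_normal((N_r, N_r))       # placeholders for the projected operators
C = rng.standard_normal((N_r, N_r, N_r))
A = rng.standard_normal((N_r, N_r))
a0 = rng.standard_normal(N_r)             # initial coefficients from projecting u(x, 0)

def rhs(t, a):
    # Eq. 2.6: da_j/dt = nu * B a  -  sum_{k,i} C_{jki} a_k a_i  -  A a
    return nu * (B @ a) - np.einsum("jki,k,i->j", C, a, a) - A @ a

sol = solve_ivp(rhs, (0.0, 1.0), a0, method="LSODA", rtol=1e-8, atol=1e-10)
print("final reduced coefficients:", sol.y[:, -1])
```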

2.4.2.2 Details of the Full Order Simulation
The convective term is discretized in space making use of Gauss's theorem. The face-center values of the variables are obtained from the cell-center ones, which are the numerical problem unknowns, with an interpolation scheme consisting of a combination of a linear and an upwind scheme. The diffusive term is discretized in a similar fashion; in this case though, a central differencing interpolation scheme with non-orthogonality correction is preferred. The pressure gradient is also discretized making use of Gauss's theorem. Here, the face-center pressure values are obtained from the cell-center ones by means of a linear interpolation scheme, in which a limiting operation on the gradient is performed so as to preserve the monotonicity condition and ensure that the extrapolated face value is bounded by the neighboring cell values. As for the time discretization, a backward Euler scheme is used. The overall time extent of the simulation is equal to T = 3645 s, which is sufficiently long to reach a perfectly periodic response of the lift and drag forces. The simulation is run in parallel on 4 Intel® Core™ processors, taking TCPU-HF = 1483 s ≈ 25 min to be completed.

2.4.2.3 Details of the ROM Simulation
The ROM is constructed using the methodologies described above. For the generation of the POD spaces, 120 snapshots of the velocity, mass flux and pressure fields are considered. The snapshots are collected in a time window covering approximately 1.5 periods of the vortex shedding phenomenon; more precisely, the last 73 s of the HF simulation are used. The first two modes for the velocity and pressure fields are presented in the reference work12. The ROM simulations are carried out using different values of the POD velocity space dimension, Nu = 3, 5, 7, 10. The dimension of the POD pressure and mass flux spaces is set equal to the dimension of the velocity POD space, Np = Nu. The ROM simulation is run in serial, on the same processor used for the HF simulation. In this case, the time advancing of the ROM problem is carried out using the Matlab ODE suite. Reproducing the full 3645 s extent of the high fidelity (HF) simulation requires, using the ROM model with the highest dimension of the POD space, approximately TCPU-ROM = 9.10 s. This corresponds to a speedup SU ≈ 650.

2.4.2.4 Analysis of the Results
Using the settings described in the previous paragraph, four different ROM simulations are run, each featuring a different value of the POD space dimension. The results are compared with those of the High Fidelity (HF) simulation in terms of the history of the lift and drag coefficients. The time window used for the comparison is the same window used for the collection of the snapshots. The lift coefficient comparison is reported in the reference work12, while the drag coefficient time histories are presented in Figure 2.2. In Figure 2.3 the comparison is shown directly on the velocity and pressure fields; in this case, the time step considered is the last one of the simulations, corresponding to T = 3645 s. The left plots in Figure 2.3 refer to the velocity (top) and pressure (bottom) fields computed with the high fidelity simulation. The right plots refer to the velocity (top) and pressure (bottom) fields computed with the ROM, in which the POD space dimension has been set to Nu = 10. The plots show that, at a glance, the HF and ROM solutions cannot be distinguished.

Figure 2.2   Comparison of the drag coefficient obtained with the High Fidelity (HF) and ROM simulations

12 Giovanni Stabile, Saddam Hijazi, Andrea Mola, Stefano Lorenzi, Gianluigi Rozza, "Advances in Reduced order modelling for CFD: vortex shedding around a circular cylinder using a POD-Galerkin method", Communications in Applied and Industrial Mathematics 9 (1), 2017, 1–21.

2.5 Addressing Challenges in Reduced-Order Modeling

One of applied mathematics' great contributions is the foundation it provides for simulating physical phenomena. From the derivation of consistent, stable, and convergent discretization schemes to the development of efficient parallel solvers, mathematical advances have enabled the ubiquitous nature of modeling and simulation in applications ranging from protein-structure prediction to aircraft design. Today, the predictive capability of validated computational models allows simulation to replace physical experimentation in many scenarios, which facilitates the realization of deeper analyses and better designs at lower costs. However, there is a catch: the resolution required to achieve such high fidelity leads to large-scale models whose simulations can consume weeks on a supercomputer. This creates a massive gap between the simulation times of high-fidelity models and the rapid time-to-solution demands of time-critical (e.g., real-time analysis) and many-query (e.g., uncertainty quantification) applications in engineering and science13. To bridge this gap, researchers have pursued reduced-order modeling, which integrates techniques from data science, modeling, and simulation as a strategy for reducing the computational cost of such models while preserving high levels of fidelity. First, these methods execute analyses (e.g., simulating the model, solving Lyapunov equations) during an offline 'training' stage; these analyses generate data that can be mined to extract important physical features, such as low-dimensional solution manifolds and interpolation points for approximating nonlinear functions. Next, these techniques reduce the dimensionality and computational complexity of the high-fidelity model by projecting the governing equations onto the low-dimensional manifold and introducing other approximations where necessary. The resulting reduced-order model (ROM) can then be rapidly simulated during an online 'deployed' stage.

Figure 2.3   Comparison between velocity and pressure High Fidelity (HF)-ROM

While significant advances have been made in reduced-order modeling over the past fifteen years, many outstanding challenges face the community, especially with respect to applying model reduction to parameterized nonlinear dynamical systems. To address this, one workshop theme focused on applying ROMs to truly large-scale nonlinear problems in engineering and science. To motivate this, an invited speaker provided a number of compelling examples in which the computational cost incurred by such models poses a major bottleneck to design engineers across the naval, aerospace, and automotive industries. A number of challenges arise in this case. First, ROM techniques must be tightly integrated with the original high-fidelity simulation code, because most nonlinear ROM methods realize computational savings by performing computations with the high-fidelity model on a small subset of the computational domain. Second, ensuring accurate ROM solutions can be challenging due to the complex dynamics (e.g., stiffness) exhibited by many large-scale dynamical systems. Finally, when the model is very large scale, the computational costs of both the offline training and the online deployment can remain prohibitive; devising ways to reduce them is often essential. A second major workshop theme focused on applying ROMs to design optimization. These many-query problems, which are often formulated as mathematical optimization problems constrained by partial differential equations, can require hundreds of simulations (and sensitivity analyses) of the computational model. Thus, rapid model evaluations are necessary when faced with time or resource constraints. Louis Durlofsky14 proposed a related method based on the Trajectory Piece-Wise Linear (TPWL) ROM, and showed promising results on oil-production optimization under water injection. Despite the many challenges, model reduction remains an exciting research area that is making rapid progress toward bridging the gap between high-fidelity models and time-critical applications in engineering and science.

13 Kevin Carlberg, "Addressing Challenges in Reduced-Order Modeling", SIAM News, March 2016.

2.6 Reduced Order CFD Simulation

Unsteady Euler and Navier-Stokes solutions have many thousands of degrees of freedom, which means that the cost of unsteady flow studies is prohibitive. Schemes that retain the accuracy of the full non-linear methods, but at a reduced cost, make such studies feasible. This is the rationale for Reduced Order Models (ROM) based on statically non-linear flow solutions but with a dynamically time-linear approach. Thus, unsteady flows that are a small perturbation about a steady flow with shocks and separations are modelled, which makes ROMs ideal for applications such as flutter clearance and aero-servo-elasticity. To generate a ROM about a particular non-linear mean solution, the dynamically time-linear response must be extracted from the CFD code. A system identification and reduction scheme is then used to construct the ROM, a state-space system, from the pulse responses (a sketch of such an identification step is given below). This system is of much lower order than the original non-linear CFD scheme, but is able to reproduce its behavior. The ROMs are in state-space form and so can easily be coupled to a structural model for aeroelastic and aero-servo-elastic calculations. One advantage of the current approach is that the aerodynamic model is constructed independently of the structural model, and thus a redesigned structure does not require a new ROM. It could be shown that the flutter boundary of a 2D airfoil can be reproduced by a ROM of order 18, where the original CFD is of order 27,000. The use of ROMs enabled each flutter point to be calculated in less than 1/100th of the computing time compared to the full CFD.
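The text does not name the system identification scheme used to build the state-space ROM from pulse responses; one widely used option is the Eigensystem Realization Algorithm (ERA), sketched below for a single-input/single-output case with synthetic data. It is an illustration of the general idea under that assumption, not the specific scheme of any particular code.

```python
import numpy as np

def era(markov, order):
    """Eigensystem Realization Algorithm for SISO impulse-response data.

    markov[k] is the unit-pulse response C A^k B; returns a discrete-time (A, B, C).
    """
    m = (len(markov) - 1) // 2
    H0 = np.array([[markov[i + j] for j in range(m)] for i in range(m)])      # Hankel matrix
    H1 = np.array([[markov[i + j + 1] for j in range(m)] for i in range(m)])  # shifted Hankel
    U, s, Vt = np.linalg.svd(H0)
    Ur, Vr = U[:, :order], Vt[:order, :].T
    Sr, Sinv = np.diag(np.sqrt(s[:order])), np.diag(1.0 / np.sqrt(s[:order]))
    A = Sinv @ Ur.T @ H1 @ Vr @ Sinv
    B = (Sr @ Vr.T)[:, :1]
    C = (Ur @ Sr)[:1, :]
    return A, B, C

# Illustrative pulse response from a known 2-state system, then a 2-state realization.
Atrue = np.array([[0.9, 0.3], [0.0, 0.7]])
Btrue = np.array([[1.0], [0.5]])
Ctrue = np.array([[1.0, 0.0]])
markov = [(Ctrue @ np.linalg.matrix_power(Atrue, k) @ Btrue).item() for k in range(41)]
A, B, C = era(markov, order=2)
print("identified eigenvalues:", np.linalg.eig(A)[0])   # should recover 0.9 and 0.7
```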

Over the years, 1D, 2D and 3D CFD software solutions have been used successfully for modeling thermo-fluid systems in the automotive, aerospace, oil, gas, power and energy industries. 1D CFD systems allow analyses of a wide range of complex engineering problems; for example, engineers can rapidly and accurately analyze piping networks of almost any size or complexity to establish design integrity. 2D CFD cross-section simulation is mostly used in airfoil design for aircraft, or for blades in pumps, compressors and turbines of turbo-machinery. However, the 3D CFD phenomena associated with these designs cannot be resolved in 2D simulations. In the real world, all fluid flow problems are 3D in nature and vary with time; however, with thoughtful care, the simulation of many components and systems can be run in fewer dimensions. If done properly, the results from a simplified solution can give just as meaningful insight, but with a fraction of the computational effort. While 1D CFD is best used for system-level analysis, to understand how different parts of a system will interact, 3D CFD is used for component-level analysis, to understand design trade-offs of detailed part design, as shown in Figure 2.4. In summary, while 1D CFD is typically much faster than 3D CFD (calculations may take only minutes to perform and provide a relatively quick system overview), 3D CFD is mainly used for the design of individual components, allowing engineers to understand how the detailed flow interacts with all manner of complex geometry15.

14 He, J., & Durlofsky, L.J., "Reduced-order modeling for compositional simulation by use of trajectory piecewise linearization", SPE Journal, 2014.

Figure 2.4   1D vs 3D Analysis

The question arises: when to use 1D CFD vs. 3D CFD? While there is not a definitive answer, the strengths and weaknesses of each approach lend themselves to two fairly well-defined arguments. When designing a single component or a small subset of components, every inch of length or degree of curvature can make a difference. In these cases, when small changes to a single part of a system are crucial, or there are significant flow variations in multiple dimensions, 3D CFD is the obvious choice because of its ability to analyze complex geometry with extreme accuracy. However, these benefits come with drawbacks, which become more evident as the scale of the design increases. When the design reaches beyond the component level, the computational requirement becomes too high and the simulations take too long to fit within the development schedule. This is when 1D CFD is a good choice, because the 1D approach simplifies the 3D geometry to the component level, usually characterized by some sort of performance data. This uses much less computing power and is usually faster than a comparable 3D model.

15 Mentor Graphics Corporation®.

2.7 Case Study 1 - Designing Parameters of Test Stage Axial Turbine

At present, turbo-machine element design using integrated software is developing intensively. The use of 3D simulation of the turbine flow path remains very labor intensive, which considerably hampers its usage; therefore, unidirectional (1D) and axisymmetric (2D) analyses are still widely used. Gas turbine engine qualitative characteristics are determined by the concepts taken into account in the early phases of engine component design, and turbine multidisciplinary optimization problems are the topic of various research efforts. After the 1D mean-line calculation, a stage-by-stage 2D (axisymmetric) calculation was performed to determine the twist laws of the blades which provide the highest efficiency. The first design stage (P1) was a prototype of the Intermediate Pressure (IP) turbine last stage of a large steam turbine, with reaction at mean radius chosen to provide axial flow exit from the stage, and with a twist following the free-vortex design law. The second design stage (P2) was intended for testing the possibility of increasing the load while preserving the axial flow exit. The main parameters of the stages put on trial are presented in Table 2.1.

Table 2.1   Main Parameters of the Test Stages (P1) and (P2)

Stage Design                               P1        P2
Inlet Pressure, Pa                         117000    130000
Inlet Temperature, K                       373       373
Outlet Pressure, Pa                        100000    100000
Rotation frequency, 1/s                    7311      8212
Nozzle vane mean diameter, m               0.2978    0.2978
Nozzle vane length, m                      0.0822    0.0822
Blade mean diameter, m                     0.2986    0.2986
Blade length, m                            0.0854    0.0854
Nozzle vane outlet gauge                   20        17.2
Nozzle vane at mean radius                 24        17.5
Nozzle vane at peripheral radius           28        17.8
Blade outlet gauging angle near hub        32        41
Blade at mean radius                       29.7      26
Blade at peripheral radius                 26        19

In addition to the stage integral characteristics, the axisymmetric computations provide the distribution of flow parameters in the axial gap along the radius. In this method, the estimation of the loss components along the radius is a subject of importance. The secondary losses were connected at the blade tip by a special algorithm.

Figure 2.5   Turbine Flow Design Process (1-D: basic pitch-line/mean-line design, contour geometry and blade parameters; 2-D: blade reverse engineering, blade-to-blade and through-flow analysis; 3-D: multistage calculation, 3-D flow phenomena and performance)

The secondary losses were calculated for each station along the blade height by fitting a local profile loss magnitude. In summary, the process can be envisioned through Figure 2.5.

2.7.1 Blade Reverse Engineering as applied to Geometry Definition
The airfoil planar shape can be derived from six control points using NURBS (Non-Uniform Rational B-Splines), defined in the preliminary design. The airfoil geometry is generated on planar design sections, with the sections arranged along the blade height following a selected rule16. A turbine designer may choose an approach to profiling in which the sections are profiled along the direction of the streamlines. The airfoil centroids are then placed upon a radial line, and a skeleton generated from the sections is covered with a surface that is itself a NURBS. In the process of constructing the planar sections, a technique of profile shape optimization on the geometry and aerodynamics was applied. The blades that were used in the test turbine stage are of particular interest from several points of view. First of all, the nozzle vane cascades are assembled from profiles supplied with trailing edge extensions; this is characterized by heightened strength properties at reasonably high efficiency and low sensitivity to inlet flow angle variation. Then, specially profiled cascades with divergent channels in the hub zone, capable of providing hub reaction at moderate loss, were used, which permits increased loading (a minimal spline sketch of this control-point construction is given at the end of this section).

Figure 2.6   Profile of Blades

2.7.2 3D Aerodynamic Computation
Experience proves that any problem solved in a 3D formulation while obviating 1D and 2D analyses is fraught with the danger of mispredicting the flow rate and efficiency, particularly when the shape is defined with low accuracy. At the same time, unidirectional and axisymmetric components feature high reliability, high speed of operation and accuracy sufficient for conventional turbine design. 3D analysis is a laborious and sophisticated tool, and the modeling time invested is several orders of magnitude larger than for 1D and 2D models. In addition, the designer needs to possess and maintain specialized skills for mesh generation, turbulence model selection, boundary condition application, etc. Indeed, all the overhead of 3D analyses is compensated by its capability to quantitatively capture flow nuances such as secondary effects in the cascade and flow separation, which cannot be precisely detected in the low fidelity models.
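To illustrate the control-point parameterization described in Section 2.7.1, the sketch below evaluates a cubic B-spline through six control points; a NURBS reduces to this non-rational special case when all weights are equal to one. The control-point coordinates are illustrative assumptions, not actual blade data.

```python
import numpy as np
from scipy.interpolate import BSpline

# Six illustrative control points (x, y) roughly tracing one side of a blade section.
ctrl = np.array([[0.00, 0.00],
                 [0.15, 0.08],
                 [0.40, 0.12],
                 [0.65, 0.09],
                 [0.85, 0.04],
                 [1.00, 0.00]])

k = 3                                            # cubic degree
# Clamped knot vector: length n + k + 1 = 10, with k + 1 repeated knots at each end.
t = np.concatenate(([0.0] * (k + 1), [1/3, 2/3], [1.0] * (k + 1)))
curve = BSpline(t, ctrl, k)

u = np.linspace(0.0, 1.0, 200)
xy = curve(u)                                    # 200 points along the profile
print(xy[0], xy[-1])   # end points should coincide with the first/last control points
```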

2.8 Case Study 2 - Cooling Air Flow Rate

The flow rate of cooling air through the heat exchangers is obviously a key parameter defining the performance of the system. It comes from two sources: the fans, and the ram effect generated by the movement of the vehicle through the atmosphere. As far as the ram air is concerned, there is a trade-off between a desired high flow rate for good heat exchanger performance and a low flow rate for minimization of overall vehicle drag; the flow of air through the front end of the vehicle typically adds 5% to the vehicle drag. There are three basic approaches to establishing the front-end air flow rate under particular operating conditions. At the most sophisticated level, complete CFD analyses can be performed which model the details of the air flow around the outside of the complete vehicle and through the vehicle front end and engine compartment, including the various heat exchangers and even the rotating fans. Figure 2.7 shows the CFD analysis of flow through a vehicle front end with streamlines and pressure contours. This method delivers a great deal of information about the system but is demanding in terms of computing effort17. At the intermediate level of complexity, commercial software exists that allows networks of 1D components to be set up and the air flow distribution through them to be calculated. This can be valuable when problems arise such as air recirculation or significant temperature and/or flow rate distribution.

Figure 2.7   Typical Cooling System Network for Airflow Rate

16 Moroz, L., Govoruschenko, Y., Pagur, P., "Proceedings of GT2005 ASME Turbo Expo 2005: Power for Land, Sea and Air", June 6-9, 2005, Reno-Tahoe, Nevada, USA.

2.9 Reduced Order Model using Empirical Relationship
Despite the sophistication of these development tools, there exists a much simpler tool for the prediction of front end air flow that proves capable of delivering considerable insight into the way the system is behaving. It is based on a 1D model that characterizes the face air flow velocity through the heat exchangers, vR, in terms of a few non-dimensional constants:

$$
v_R = \left( \frac{F\,v_0^2 + \psi_0\,u_0^2}{1 + \zeta_R + \zeta_{sys} + \zeta_F} \right)^{1/2}
\qquad \textbf{Eq. 2.7}
$$

Where:
• F: measure of the effectiveness of the front end shape in delivering the ram effect
• V0: vehicle velocity
• ψ0: maximum non-dimensional pressure coefficient of the fan
• u0: fan tip speed
• ζR: pressure drop coefficient of the heat exchangers
• ζsys: pressure drop coefficient of the remainder of the system (grill, engine compartment, etc.)
• ζF: pressure drop coefficient for the fan itself

The appropriate values for the unknown system and fan constants (F, ψ0, ζsys, ζF) are determined from wind tunnel measurements of air flow rates through the heat exchangers over ranges of different vehicle speeds and fan speeds. The pressure drop coefficients of the heat exchangers as functions of air flow rate are already known. The values of the system constants are extracted from the experimental dataset using non-linear optimization techniques. Knowledge of the values of these parameters for a system allows the air flow rate through the heat exchangers to be explored for any vehicle speed or fan speed, and even allows the effects of different heat exchangers to be evaluated18.
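A minimal sketch of the parameter-extraction step is shown below, assuming Eq. 2.7 as reconstructed above, a synthetic set of "wind tunnel" operating points, a known (here constant) heat-exchanger loss coefficient ζR, and scipy's non-linear least-squares solver. The numerical values are placeholders, not measured data.

```python
# Sketch: extracting the system constants (F, psi0, zeta_sys, zeta_F) of Eq. 2.7
# from measured face velocities via non-linear least squares. All data values
# below are synthetic placeholders; zeta_R is assumed known at each point.
import numpy as np
from scipy.optimize import least_squares

v0      = np.array([10., 20., 30., 40., 10., 20., 30., 40.])   # vehicle speed, m/s
u0      = np.array([30., 30., 30., 30., 60., 60., 60., 60.])   # fan tip speed, m/s
zeta_R  = np.full_like(v0, 4.0)                                 # HX loss coefficient
vR_meas = np.array([3.9, 5.4, 7.1, 9.0, 7.0, 7.9, 9.2, 10.7])   # measured face velocity

def vR_model(params, v0, u0, zeta_R):
    F, psi0, zeta_sys, zeta_F = params
    return np.sqrt((F * v0**2 + psi0 * u0**2) /
                   (1.0 + zeta_R + zeta_sys + zeta_F))          # Eq. 2.7

def residuals(params):
    return vR_model(params, v0, u0, zeta_R) - vR_meas

fit = least_squares(residuals, x0=[0.3, 0.2, 2.0, 1.0],
                    bounds=([0, 0, 0, 0], np.inf))
F, psi0, zeta_sys, zeta_F = fit.x
print(F, psi0, zeta_sys, zeta_F)
```

Once fitted, the same model can be evaluated at any vehicle or fan speed, which is exactly how the reduced order model is used in practice.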

17 Mentor Graphics Corporation 2012.
18 See previous.



3 Computational Error and Uncertainty Quantification
CFD is still a tool which requires that a user has a good understanding of the uncertainties and errors that might spoil a CFD simulation. There exists no error control in CFD, and any CFD simulation must be interpreted by an experienced user to have some credibility. Without some knowledge about possible errors and how they can be handled, a CFD simulation cannot be trusted. Errors can occur at different places:

• Definition of the problem: what needs to be analyzed;
• Selection of the solution strategy: what physical models and what numerical tools should be used;
• Development of the computational model: how should the geometry and the numerical tools be set up;
• Analysis and interpretation of the results: how should the model be analyzed and the results be interpreted.

There exist many different definitions of errors. In this guide the errors are classified into four types:

• Problem definition errors,
• Model errors,
• Numerical errors, and
• User and code errors.

The sections below describe these errors and give some guidelines on how to avoid them. While the terms Uncertainty Quantification (UQ) and Error are commonly used interchangeably in everyday language, some basic definitions are warranted here. We follow the definitions of the AIAA Guidelines, which define Uncertainty as: "A potential deficiency in any phase or activity of the modeling process that is due to the lack of knowledge." And Error as: "A recognizable deficiency in any phase or activity of modeling and simulation that is not due to lack of knowledge" (AIAA G-077-1998). The key phrase differentiating the definitions of uncertainty and error is lack of knowledge. The key word in the definition of uncertainty is potential, which indicates that deficiencies may or may not exist. Lack of knowledge has primarily to do with lack of knowledge about the physical processes that go into building the model. Sensitivity and uncertainty analyses can be used to better determine uncertainty. Uncertainty applies, for example, to describing deficiencies in turbulence modeling; there is a lot about turbulence modeling that is not understood. One approach for determining the level of uncertainty and its effect on one's analysis is to run a number of simulations with a variety of turbulence models and see how the modeling affects the results19. The definition of error implies that the deficiency is identifiable upon examination. One can differentiate between local and global errors. Local errors refer to errors at a grid point or cell, whereas global errors refer to errors over the entire flow domain. We are interested here in the global error of the solution, which accounts for the local error at each grid point but is more than just the sum of the local errors. Local errors are transported, advected, and diffused throughout the grid. The definition of error presented here is different from that which an experimentalist may use, which is "the difference between the measured value and the exact value". Experimentalists usually define uncertainty as "the estimate of error"19.

19 Responsible NASA Official/Curator: John W. Slater


of error". These definitions are inadequate for computational simulations because the exact value is typically not known. Further these definitions link error with uncertainty. The definitions provided in the above paragraphs are more definite because they differentiate error and uncertainty according to what is known.

3.1 Classification of Errors
Here we provide a classification, or taxonomy, of errors; namely Acknowledged Errors and Unacknowledged Errors. According to NASA, acknowledged errors (examples include round-off error and discretization error) have procedures for identifying them and possibly removing them. Otherwise they can remain in the code with their error estimated and listed. Unacknowledged errors, on the other hand (examples include computer programming errors or usage errors), have no set procedures for finding them and may continue within the code or simulation.

1 - Acknowledged Errors
➢ Modeling Errors
  • Physical Modeling Errors
  • Geometry Modeling Errors
➢ Discretization Errors
  • Spatial Discretization Errors (Discretization of Governing Equations)
  • Domain Discretization Errors (Grid Generation)
    ✓ Grid Density (Grid Independence Study)
    ✓ Topology
    ✓ Grid Sensitivity
    ✓ Error Estimation
      ❖ Higher-Order Accurate (Richardson's Extrapolation)
      ❖ Residual Based (Truncation Method)
  • Temporal Discretization Errors
➢ Iterative Convergence Errors
➢ Computer Round-off Errors
➢ Truncation Errors

2 - Unacknowledged Errors
➢ Code Errors
➢ Usage Errors

Each of these types of errors will be discussed below.

1.1 Physical Modeling Error
Physical modeling errors are those due to uncertainty in the formulation of the model and to deliberate simplifications of the model. These errors deal with the continuum model only; converting the model to discrete form for the code is discussed as part of discretization errors. Errors in the modeling of the fluids or solids problem are concerned with the choice of the governing equations which are solved and the models for the fluid or solid properties. Further, the issue of providing a well-posed problem can contribute to modeling errors. Often modeling is required for turbulence quantities, transition, and boundary conditions. Mehta lists the sources of uncertainty in physical models as:


• The phenomenon is not thoroughly understood;
• Parameters used in the model are known but with some degree of uncertainty;
• Appropriate models are simplified, thus introducing uncertainty;
• Experimental confirmation of the models is not possible or is incomplete.

Even when a physical process is known to a high level of accuracy, a simplified model may be used within the CFD code for the convenience of a more efficient computation. Physical modeling errors are examined by performing validation studies that focus on certain models (i.e., inviscid flow, turbulent boundary layers, real-gas flows, etc.). It is essential to have an overview of the physics involved and how the problem can best be analyzed. Running a 2D simulation in order to understand secondary flows, or running a steady simulation in order to understand transient behavior, is of course of no use. When assessing a CFD simulation, the first thing to consider is what physical phenomena are important for the results and whether the selected type of simulation is suitable to resolve this type of phenomena. For further information about selecting the most suitable type of simulation, please see the previous chapter on deciding what type of simulation to perform. Once the type of simulation has been selected, the next step is to select what type of physical models the simulation should use. The following points should be considered:

• Gas data (incompressible/compressible, perfect gas/real gas, ...);
• Turbulence modeling (type of model, type of near-wall treatment, ...);
• Other models (combustion, sprays, ...).

When assessing model related errors, it is important to know the features of the selected model and to think carefully about how these features and possible shortcomings might affect the predicted physical behavior. Using the wrong turbulence model or combustion model can completely destroy the results of a CFD simulation.
3.1.1 Uncertainty Quantification of Turbulence Models20
The uncertainty arising due to the turbulence model form has been described as the greatest challenge in simulation based design in aerospace applications21. A vast majority of CFD computations rely on eddy-viscosity based closures, such as the k-ε and k-ω models. Due to assumptions and simplifications introduced in their formulations, these models are limited in the characteristics of turbulent flows that they can replicate and in their overall fidelity. Such simplifications include coarse graining, where it is assumed that the Reynolds stress tensor can adequately describe the turbulent flow field; the eddy viscosity hypothesis, which assumes the Reynolds stress anisotropy to be proportional to the mean rate of strain; and the application of the gradient diffusion hypothesis, among others. These simplifications introduce structural uncertainty in the predictions of such turbulence models22-23. While both reliable methodologies and validated tools24
20 A. Mishra, J. Mukhopadhaya, G. Iaccarino, J. Alonso, "Uncertainty quantification of turbulence models for complex aerospace flows", Stanford University, 2018.
21 T. A. Zang, M. J. Hemsch, M. W. Hilburger, S. P. Kenny, J. M. Luckring, P. Maghami, S. L. Padula, and W. J. Stroud, "Needs and opportunities for uncertainty-based multidisciplinary design methods for aerospace vehicles, NASA/TM-2002-211462," tech. rep., NASA Langley Research Center, 2002.
22 B. Launder, D. Tselepidakis, and B. Younis, "A second-moment closure study of rotating channel flow," Journal of Fluid Mechanics, vol. 183, pp. 63-75, 1987.
23 T. Craft, B. Launder, and K. Suga, "Development and application of a cubic eddy viscosity model of turbulence," International Journal of Heat and Fluid Flow, vol. 17, no. 2, pp. 108-115, 1996.
24 B. H. Thacker, D. S. Riha, S. H. Fitch, L. J. Huyse, and J. B. Pleming, "Probabilistic engineering analysis using the NESSUS software," Structural Safety, vol. 28, no. 1, pp. 83-107, 2006.


are available to quantify the uncertainty due to other sources, there are no strategies or tools available to estimate the epistemic uncertainty due to turbulence model form. Two major approaches can be distinguished:

➢ Using a single turbulence model (e.g., RANS) with perturbation
➢ Using different models without perturbation

3.1.2 Single Turbulence Model with Perturbation
The uncertainty estimation library utilizes the Eigenspace Perturbation methodology; the underlying rationale and its function are detailed in25-26. The focus here is on results obtained with the library, thus, in this section, we provide only a brief overview. To account for the errors due to closure assumptions, the spectral representation of the modeled Reynolds stress tensor is perturbed during the CFD solution iterations. The process of deriving uncertainty estimates from perturbed simulations is schematically reported in Figure 3.1. The central panel outlines the unperturbed, baseline CFD solution, giving a unique flow field realization in the domain and a single profile for the QoI at the highlighted region (here, the mean velocity at x/H = 24 in the diffuser). The upper and lower panels of the figure outline perturbed solutions (only two of the perturbations are shown for clarity). Each perturbation leads to a different realization of the flow field, leading to a different profile for the QoI. The uncertainty estimates on the profiles of a QoI at a location are engendered by the union of all the states lying in the profiles from this set of perturbed RANS simulations. This is illustrated by the gray shaded zone on the right of Figure 3.1. The results presented in the following section focus on uncertainty quantification in complex turbulent flows relevant to aerospace design. In all cases, the k-ω SST model was used and the unperturbed model predictions are reported as the baseline solution.
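To make the eigenvalue portion of this idea concrete, the sketch below perturbs the eigenvalues of the Reynolds-stress anisotropy toward a limiting state of the barycentric triangle and rebuilds the stress tensor. This is a stand-alone illustration under simplifying assumptions (only eigenvalues are perturbed, the corner and perturbation magnitude are arbitrary choices), not the UQ library cited in the text, which also perturbs eigenvectors and turbulence kinetic energy.

```python
# Illustrative sketch of the eigenvalue part of an eigenspace perturbation:
# decompose the Reynolds-stress anisotropy, shift its eigenvalues toward a
# limiting state of the barycentric triangle, and rebuild the stress tensor.
import numpy as np

# Barycentric corners: one-, two- and three-component limiting states
CORNERS = {"1C": np.array([2/3, -1/3, -1/3]),
           "2C": np.array([1/6,  1/6, -1/3]),
           "3C": np.array([0.0,  0.0,  0.0])}

def perturb_reynolds_stress(R, k, target="1C", delta=0.5):
    """Return a perturbed Reynolds stress tensor (3x3); k = 0.5*trace(R)."""
    a = R / (2.0 * k) - np.eye(3) / 3.0              # anisotropy tensor
    eigval, eigvec = np.linalg.eigh(a)               # ascending eigenvalues
    eigval, eigvec = eigval[::-1], eigvec[:, ::-1]   # reorder to descending
    eigval_new = (1.0 - delta) * eigval + delta * CORNERS[target]
    a_new = eigvec @ np.diag(eigval_new) @ eigvec.T
    return 2.0 * k * (a_new + np.eye(3) / 3.0)

# Example: perturb an isotropic stress state halfway toward the 1C limit
k = 1.5
R = 2.0 * k * np.eye(3) / 3.0
print(perturb_reynolds_stress(R, k, target="1C", delta=0.5))
```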

Figure 3.1 Schematic Composition of Uncertainty Estimates in a Diffuser

25 G. Iaccarino, A. Mishra, and S. Ghili, "Eigenspace perturbations for uncertainty estimation of single-point turbulence closures," Physical Review Fluids, vol. 2, no. 2, 2017.
26 M. Emory, J. Larsson, and G. Iaccarino, "Modeling of structural uncertainties in Reynolds-averaged Navier-Stokes closures," Physics of Fluids, vol. 25, no. 11, 2013.


Figure 3.2 Variation in CP at (a) y/b = 0.44 and (b) y/b = 0.65 along the ONERA M6 wing

3.1.2.1 Transonic Flow over the ONERA M6 Wing
The ONERA M6 design is a low aspect ratio, swept, tapered wing model. The transonic flow over this configuration manifests shock-boundary layer interactions, leading to significant challenges for eddy-viscosity based models[22]. We perform simulations of this case at M = 0.84, α = 3.06 deg, and Re = 11.72×10⁶, corresponding to the conditions of [23], representative of the actual flight of military and civilian aircraft. The reference data consist of Coefficient of Pressure (CP) measurements at fixed spanwise locations. We outline this case as a test against the false positive. In locations where the RANS model discrepancy is significant, the uncertainty bounds should indicate this. However, in locations where the RANS predictions are accurate, spurious uncertainty bounds that are significant in their extent would be misleading and amount to a false positive. On the ONERA wing, the RANS discrepancy is restricted to the suction surface and the RANS predictions are accurate on the pressure surface. This is reflected in the uncertainty bounds in Figure 3.2, which are substantial only on the suction surface at the locations of the shock and near the trailing edge. In Figure 3.3, we outline a qualitative spatial comparison of the uncertainty magnitude of the Cp and the location of the shock. The uncertainty in RANS predictions is substantial at the trailing edge and the locations of the shock. Physically, RANS models are unable to capture the complex shock-

Figure 3.3 Comparison of (a) the Uncertainty Estimates versus (b) the Shock on the Surface of the ONERA M6 Wing


boundary layer interactions and should have deficient predictions at the shock locations, as is suggested in the figure. Additionally, the figure highlights the capability of the UQ library to provide spatial estimates of the reliability of RANS predictions, which may be invaluable to guide design decisions.
3.1.2.2 Supersonic Flow Through a Converging-Diverging Seiner Nozzle
Eddy-viscosity based models are the workhorse for the design of aircraft exhaust nozzles, but they have significant limitations for jets with three-dimensionality, compressibility, and high temperature streams. With the advent of increasingly stringent regulations on aircraft noise and efficiency, the industry is exploring new designs such as lobed mixers, chevrons, etc.27 In this scenario, it is critically

Figure 3.4 Variation in Mach number (M), pressure (P/Pexit) and temperature (T/Texit) along the centerline of the jet efflux, x/Djet

27 J. M. Seiner, M. K. Ponton, B. J. Jansen, and N. T. Lagen, "The effects of temperature on supersonic jet noise emission," in 14th DGLR/AIAA Aeroacoustics Conference, vol. 1, pp. 295-307, 1992.


important to have tools that can guide this design process by providing reliable uncertainty estimates for the RANS model predictions. To this end, we replicate the experimental configuration. The experiment was conducted on an axisymmetric, convergent-divergent nozzle with a diameter of 9.144 cm. The jet exhausted into a quiescent ambient, with M = 2.0 and Reynolds number 1.3×10⁶. As the results in Figure 3.4 indicate, the SST model over-predicts the potential core and the rate of mixing is lower than that indicated by the experimental data. A major source of RANS discrepancy is the incorrect prediction of the relative strengths of turbulent kinetic energy production and dissipation. The perturbations on the eigenvectors of the Reynolds stresses modulate their alignment with the mean rate of strain eigendirections. This varies and bounds the relative strengths of the production and dissipation mechanisms. The resultant uncertainty estimates are able to account for the discrepancy in the Mach number, temperature and pressure along the centerline of the jet efflux.
3.1.3 Multiple Turbulence Models without Perturbation
Figure 3.5 depicts the effects of different turbulence models on a turbine blade vortex at 20% chord length. Another example is the effect of different turbulence models on a steep obstacle, as shown in Figure 3.6. As is evident, in both cases slightly different solutions result, and this is therefore a source of uncertainty and error.

Figure 3.5 Pressure Coefficient at 20% Chord Length using Different Turbulence Models (κ-ε, LES, κ-ω SST, Reynolds Stress)


Figure 3.6 Effects of Different Turbulence Models on a Steep Obstacle

3.2 Geometrical Modeling Errors
It is almost always necessary to simplify the geometry in some form. When assessing a CFD simulation, one should consider how the geometrical simplifications can affect the physical phenomena of interest. Typical geometrical errors are:

• Simplifications: Small geometrical features like fillets, small steps or gaps can often be disregarded. When disregarding this type of feature, one should consider whether it might affect the important physics. For example, a very large fillet on the suction side of a vane might affect corner separations near the end-walls, and a large tip-leakage might affect the flow physics significantly in the upper part of a compressor.
• Tolerances and manufacturing discrepancies: If the geometry has very large tolerances or is manufactured in a way which might produce a non-ideal shape or position, it might be necessary to perform additional CFD simulations in order to cover the whole span of possible real geometries.
• Surface conditions: roughness, welds, steps, gaps, etc. CFD simulations often assume a perfectly smooth surface. A non-smooth surface, which might have welds, steps or even gaps, will of course produce different results. If the physical phenomena of interest might depend on the surface conditions, these should of course be considered. Typical phenomena that might depend on this type of error are transition prediction, leakage flows, etc.

3.3 Spatial Discretization (Governing Equations) Errors
In general, discretization error is defined as the difference between the numerical solution to the discretized equations and the exact solution to the partial differential (or integral) equations. Spatial discretization of the governing equations establishes the discrete equations over the domain discretization. There are currently four methods of discretization available:

• Taylor series expansion
• Polynomial fitting
• Integral method
• Control volume approach


Each of these has its own characteristics, and it is sometimes possible to obtain exactly the same formulation using all four methods, especially for simple linear cases. Two different ideas should be discussed. The first has to do with the PDEs themselves, in regard to terms like "conservation form", "conservation law form", or "divergence form". The difference between conservative and non-conservative representations of an equation has been discussed before. Normally, for a PDE which represents a physical conservation statement, this means the divergence of a physical quantity can be identified in the equation. The second idea is that of the conservative property of the finite difference representation. Such a PDE represents a conservation statement at a point. We strive to construct a finite difference representation which provides a good approximation to the PDE in a small, local neighborhood involving a few grid points. The same conservation principle which gave rise to the PDEs also applies to arbitrarily large regions (control volumes) through the use of the Divergence theorem (Eq. 3.1), which converts a volume integral into an integral over the bounding control surface.

$$
\iiint_V \left( \nabla \cdot \vec{F} \right) dV \;=\; \oiint_S \left( \vec{F} \cdot \vec{n} \right) dS
\qquad \textbf{Eq. 3.1}
$$

Finite difference schemes which maintain the discretized version of the conservation statement are said to have the conservation property. For the majority of problems this property is crucial. The differences between the finite difference and finite volume methods are subtle but can be generalized as follows:

Finite difference
• Approximates the governing equation at a point.
• Finite difference methods were developed earlier; the analysis of these methods is easier and further developed.

Finite volume
• Approximates the governing equation over a volume.
• Finite volume is the most physical approach in fluid mechanics, and is actually used in most codes.
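The sketch below illustrates the conservation property in one dimension: summing the per-cell finite-volume updates makes all interior face fluxes cancel in pairs, so the total of the conserved quantity changes only through the boundary fluxes. A simple first-order upwind flux for linear advection with periodic boundaries is assumed; it is an illustration, not any particular code's scheme.

```python
# Minimal sketch of the conservation property of a finite-volume discretization:
# interior face fluxes telescope away when the cell updates are summed, so the
# total conserved quantity is controlled by the boundary fluxes alone.
import numpy as np

nx, a = 50, 1.0
dx, dt = 1.0 / nx, 0.005
x = np.linspace(dx / 2, 1.0 - dx / 2, nx)          # cell centers
u = np.exp(-100.0 * (x - 0.3) ** 2)                # cell-averaged quantity

def face_fluxes(u):
    """Upwind flux a*u at the nx+1 cell faces (periodic boundaries)."""
    u_left = np.concatenate(([u[-1]], u))          # upwind state at each face
    return a * u_left

total_before = dx * u.sum()
F = face_fluxes(u)
u_new = u - dt / dx * (F[1:] - F[:-1])             # finite-volume update, cell by cell
total_after = dx * u_new.sum()

# With periodic boundaries the two boundary fluxes are identical, so the total
# is conserved to round-off; the interior fluxes always cancel in pairs.
print(total_before, total_after, abs(total_after - total_before))
```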

Figure 3.7 Inviscid stencil with 1st order cells in red and 2nd order cells in green

To that end, several discretization techniques were tabulated previously (Tables 1 & 2), which serve as an excellent guide to spatial discretization techniques, both explicit and implicit. Although higher order schemes seem to perform better, their high CPU costs prevent them from being used routinely. On balance, based on the recommendations from these tables, only 2nd order or higher should be considered, with stability kept in mind. To illustrate this, second-order spatial accuracy for the inviscid fluxes is achieved by using MUSCL extrapolation to reconstruct an approximate value of the primitive variables on each side of each cell face. The MUSCL scheme for the inviscid fluxes uses a 13 point stencil per cell in three dimensions, as shown in Figure 3.7, with 1st order cells in red and 2nd order cells in green. The viscous flux is calculated using a Green's theorem approach to compute the derivatives at cell faces, and central differencing is used to calculate the scalar values.

Figure 3.8 Viscous stencil with viscous cells in blue and 2nd order cells in green

The viscous fluxes require a further twelve points to be added to the


inviscid stencil, for a total of 25 cells in the stencil per cell in three dimensions, as shown in Figure 3.8, with viscous cells in blue and 2nd order cells in green.
3.3.1 Higher Order Discretization
There is a big debate in the CFD community on whether to use 1st, 2nd or higher order discretization. There are of course differences between 1st and 2nd order, as depicted in Figure 3.10: the 2nd order scheme, i.e., central differencing, reaches the desired error level much faster than the 1st order one. The picture remains murkier for higher order differences. Some flows are diffusion dominated and others convection dominated; an example is supersonic flow (high Reynolds number, convective) versus subsonic (diffusive) flow. In that case, the rule of thumb is to use the Peclet number (Pe = Re·Pr), according to Figure 3.9. When discretizing the Navier-Stokes equations, this notion applies to the discretization of the convective term. While the centered scheme is 2nd order accurate and the upwind scheme is only first order, the upwind scheme is more diffusive than the centered scheme but also more stable; the centered scheme can lead to spurious oscillations (dispersive errors). It is well known that when the cell Peclet number is > 2, it is preferable to switch from the centered scheme to the upwind scheme. Due to this possible oscillating behavior of the centered scheme, especially in convection dominated flows (high Reynolds number), the use of the upwind scheme can stabilize the solution. In conclusion, the QUICK and third-order MUSCL discretization schemes may provide better accuracy than the 2nd order scheme. The QUICK scheme is applicable to quadrilateral or hexahedral meshes, while the MUSCL scheme is used on all types of meshes. In general, however, the second-order scheme is sufficient and the QUICK scheme will not provide significant improvements in accuracy. There are other, fancier higher order schemes which have been designed for, and work in, special occasions. For the time being, 2nd or 3rd order accuracy is deemed sufficient for most applications.

Figure 3.9 Effect of Pe Number in balancing Diffusive and Convective Flows
Figure 3.10 Effect of 1st and 2nd Order Differencing Scheme on Error
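The Peclet-number rule of thumb above can be illustrated with a small numerical experiment, sketched below for the steady 1D convection-diffusion equation with Dirichlet boundary values φ(0) = 0 and φ(1) = 1. The finite-volume coefficients follow the familiar central and first-order upwind forms; the boundary cells are treated crudely (the boundary value is applied through the regular neighbor coefficient), so this is a qualitative sketch rather than a production discretization.

```python
# Sketch: steady 1D convection-diffusion, central vs first-order upwind convection.
# For a cell Peclet number Pe = rho_u*dx/gamma > 2 the central scheme produces the
# oscillations discussed in the text; the upwind scheme stays bounded but diffusive.
import numpy as np

def solve(n, rho_u, gamma, scheme="central"):
    dx = 1.0 / n
    D, F = gamma / dx, rho_u                       # diffusive / convective strengths
    A = np.zeros((n, n)); b = np.zeros(n)
    for i in range(n):
        if scheme == "central":
            aW, aE = D + F / 2.0, D - F / 2.0
        else:                                      # upwind, flow in +x direction
            aW, aE = D + F, D
        A[i, i] = aW + aE                          # aP
        if i > 0:
            A[i, i - 1] = -aW
        # left boundary value is 0, so it adds nothing to b
        if i < n - 1:
            A[i, i + 1] = -aE
        else:
            b[i] += aE * 1.0                       # phi(1) = 1
    return np.linalg.solve(A, b)

n, gamma = 10, 0.1
for rho_u in (1.0, 5.0):                           # cell Peclet = rho_u*dx/gamma = 1, 5
    print("Pe_cell =", rho_u * (1.0 / n) / gamma)
    print("  central:", np.round(solve(n, rho_u, gamma, "central"), 3))
    print("  upwind :", np.round(solve(n, rho_u, gamma, "upwind"), 3))
```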

3.4 Discretization Errors
3.4.1 Mesh Density


To minimize the effort, users are advised to consult the guidelines developed by ITTC, ASME, or the Journal of Fluids Engineering before instigating a numerical simulation. These errors are due to the difference between the exact solution and the numerical representation of the solution in space. Discretization errors can either be spatial errors in space or temporal errors in time. Spatial discretization errors are what people normally call discretization errors. The effects of spatial discretization are the most pronounced and have rightfully been investigated in depth. They can be quantified by a sequence of systematic mesh refinement/coarsening, as depicted later on. The errors and uncertainties, as well as the sensitivity of the solution with respect to mesh size, can then be estimated. To minimize the uncertainties and errors within a numerical simulation, sets of parametric studies and comparisons are developed. These are simple, yet time consuming and tedious, especially for real applications. Describing exactly what discretization different codes use and what errors this might lead to is not possible here. Instead, some general rules to avoid these errors can be summarized as:

• Use at least a 2nd order accurate scheme, preferably a 3rd order accurate scheme. Some general purpose codes have a 1st order upwind scheme as default; this is a very diffusive scheme that often gives too smooth results.
• For new applications, always run a simulation with a finer mesh to see how grid independent your solution is.
• Be aware of checker-board errors. Checker-board errors occur close to strong shocks and other large discontinuities and can be seen as a wavy pattern with a wavelength of two cells. Some schemes, especially those which behave like central differencing schemes, are more prone to checker-board errors. Upwind schemes are a bit better, and schemes like TVD or shock-capturing schemes are even better. (See Figure 3.11).

Figure 3.11 Effects of mesh density on solution domain


3.4.2 Grid Independence Study
Performing a grid independence study is fairly straightforward, as seen in the compressible flow over a forward-facing step example (Figure 3.11):

1. Run the initial simulation on your initial mesh and ensure convergence of the residual error to 10-4, that the monitor points are steady, and that the imbalances are below 1%. If not, refine the mesh and repeat.
2. Once you have met the convergence criteria above for your first simulation, refine the mesh globally so that you have finer cells throughout the domain. Generally we would aim for around 1.5 times the initial mesh size. Run the simulation and ensure that the residual error drops below 10-4, that the monitor points are steady, and that the imbalances are below 1%. At this point you need to compare the monitor point values from Step 2 against the values from Step 1. If they are the same (within your own allowable tolerance), then the mesh at Step 1 was accurate enough to capture the result. If the value at Step 2 is not within acceptable values of the Step 1 result, then this means that your solution is changing because of your mesh resolution, and hence the solution is not yet independent of the mesh. In this case you will need to move to Step 3.
3. Because your solution is changing with the refinement of the mesh, you have not yet achieved a mesh independent solution. You need to refine the mesh further and repeat the process until you have a solution that is independent of the mesh. You should then always use the smallest mesh that gives you this mesh independent solution (to reduce your simulation run time).

3.4.3 Grid Topology
Before we pay attention to the individual cell topology, we consider the domain topologies compared for the 2D case, namely the H, C, and O topologies. Meshes with H-H and C-H topology were constructed for 3D comparison; however, due to the incompatibility of the C-H structure on a sharp wing tip or trailing edge with the current solver, no C-H studies are included. Most of the studies were under lifting inviscid flow conditions. Multiple studies were conducted under turbulent conditions but only one is included. Overall, when it comes to topology, the H mesh scores first place, followed by the C mesh, and the O mesh comes last. When it comes to mesh parameters, the studies show that with carefully chosen mesh spacing around the leading edge, good orthogonality and skewness factors, smooth spacing variation, and a reasonable number of nodes, excellent CFD results can be obtained from the mesh in terms of accuracy of the computed functional, determined convergence order, and adjoint error estimation. With regard to the topology of individual cells, three types are considered: hexahedral, tetrahedral, and polyhedral. The solution on the polyhedral mesh produced the lowest absolute residual value, as evident in the figure, and the number of iterations for each mesh type to reach the same level of convergence (10-4) for the pressure residual has been compared. While there are minor differences in the converged pressure drop, the simulations are in broad agreement on the overall value. The number of iterations for each mesh type to reach a steady state value for the

Figure 3.12 Domain Topology (O-Type, C-Type, and H-Type; from left to right)


pressure drop are shown accordingly. The volumetric cell count for polyhedral cells is less than for the other two, therefore saving valuable time and effort in computation. (See Figure 3.12).
3.4.4 Sources of Discretization Error
Discretization error occurs during the approximate numerical solution of differential equations. Evaluation of discretization error requires knowing the exact solution of the governing equations, which is generally not known for problems of practical interest. In such scenarios, a mathematically rigorous technique called the Method of Manufactured Solutions (MMS) can be used, where a solution is manufactured and used as an exact solution. MMS is based upon the philosophy that code verification deals with the mathematics of the problem and hence arbitrary functions (with certain requirements, as discussed later) can be selected as exact solutions28. Of the various sources of numerical error, discretization error is generally the largest and usually the most difficult to estimate. The goal here is to review the different approaches for estimating discretization error and to present a general framework for their classification. The first category of discretization error estimator is based on estimates of the exact solution to the differential equation which are higher-order accurate than the underlying numerical solutions, and includes approaches29 such as:

➢ Residual (i.e., the truncation error)
➢ Gradient and Flux Calculation (different geometries)

Residual-based approaches include discretization error transport equations, finite element residual methods, and adjoint method extensions. The discretization error has two components: one that is locally generated and one that is transported from elsewhere in the domain. The transported component is called pollution error by the finite element community, and it can be related to the convergence of the numerical method (i.e., the truncation error). The truncation error is the difference between the discrete equations and the mathematical model equations. Thus the discretization error is transported in the same manner as the underlying solution properties (e.g., it can be convected and diffused) and it is locally generated according to the truncation error.
3.4.5 Case Study – Hypersonic Flow over an Axisymmetric Sphere-Cone
An example of error transport for the Euler equations is shown below in Figure 3.13, which gives the error in the density for the inviscid, Mach 8 flow over an axisymmetric sphere-cone (Roy)30. The flow is from left to right, and large discretization errors are generated at the bow shock wave where the shock and the grid lines are misaligned. In the subsonic (i.e., elliptic) region of the flow immediately behind the normal shock, these errors are convected along the local streamlines. In the supersonic (hyperbolic) regions these errors propagate along characteristic Mach lines and reflect off the surface. Additional error is generated at the sphere-cone tangency point, which represents a singularity due to the discontinuity in the surface curvature. Errors from this region also propagate downstream along the characteristic Mach line. An adaptation process which is driven by the global error levels would adapt to the characteristic line emanating from the sphere-cone tangency point, which is not desired. An adaptation process driven by the local contribution to the error should adapt

28 Aniruddha Choudhary, "Verification of Compressible and Incompressible Computational Fluid Dynamics Codes and Residual-based Mesh Adaptation", Dissertation submitted to the faculty of the Virginia Polytechnic Institute and State University, 2014.
29 Christopher J. Roy, "Review of Discretization Error Estimators in Scientific Computing", 48th AIAA Aerospace Sciences Meeting Including the New Horizons Forum and Aerospace Exposition, January 2010, Orlando, FL.
30 Roy, C. J. (2003), "Grid Convergence Error Analysis for Mixed-Order Numerical Schemes", AIAA Journal, Vol. 41, No. 4, pp. 595-604.


to the sphere-cone tangency point, thus obviating the need for adaption to the characteristic line that emanates from it.

Figure 3.13 Contours of Total Estimated Discretization Error in Density

3.4.6 Estimating Discretization Error
There are a number of approaches available for estimating discretization error. These methods can be broadly categorized as a priori methods and a posteriori methods. The a priori methods are those that allow a bound to be placed on the discretization error before any numerical solution is even computed. One approach to developing an a priori discretization error estimator is to perform a truncation error analysis for the scheme, relate the truncation error to the discretization error (e.g., through a discretization error transport equation), and then develop some approximate bounds on the solution derivatives. The main failing of a priori error estimators is that the resulting error estimate greatly over-estimates the true discretization error. A priori methods are generally only useful for assessing the formal order of accuracy of a discretization scheme. A posteriori methods provide an error estimate only after the numerical solution has been computed. They use the computed solution to the discrete equations, possibly with additional information supplied by the equations, to estimate the error relative to the exact solution of the mathematical model. The initial developments up to the early 1990s were mainly concentrated on linear, elliptic, scalar mathematical models. Up to this point, a posteriori error estimation was limited to analysis of the energy norm of the discretization error, which for Poisson's equation can be written on element k as:

$$
\left\| \varepsilon \right\|_k = \left[ \int_{V_k} \left| \nabla \vec{u}_h - \nabla \vec{u}_{exact} \right|^2 dV \right]^{1/2}
\qquad \textbf{Eq. 3.2}
$$


where ε is the discretization error, h is a measure of the element size (e.g., Δx), p is the formal order of accuracy of the method, uh represents the solution to the discrete equations on a mesh with a representative cell length of h, and uexact is the exact solution to the mathematical model. In general, the level of maturity of a posteriori error estimation methods is strongly problem dependent. All of the discretization error estimators discussed here were originally developed for elliptic problems. As a result, they tend to work well for elliptic problems, but are not as well developed for mathematical models that are parabolic or hyperbolic in nature. The level of complexity of the problem is also an important issue. The error estimators work well for smooth, linear problems with simple physics and geometries; however, strong nonlinearities, discontinuities, singularities, and physical and geometric complexity can significantly reduce the reliability and applicability of a posteriori discretization error estimation methods.

Table 3.1 Discretization Error for 2D Burger's Equation
Mesh spacing | L2 norm     | Slope of segment
0.75         | 0.013831341 | N/A
0.375        | 0.00350755  | 1.97940
0.1875       | 0.00095693  | 1.87398
0.09375      | 0.00023901  | 2.00129
0.046875     | 0.00006023  | 1.98851
0.0234375    | 0.00001505  | 2.00001
0.0117188    | 0.00000375  | 2.00000

Similarly, in an investigation performed by [Yan and Ollivier-Gooch]31, an Error Transport Equation (ETE) was developed to estimate the discretization error. The ETE is an auxiliary partial differential equation (PDE) derived from the primal one. They compare the accuracy of the resulting discretization error estimate from the linearized ETE and the nonlinear ETE to solving the higher order primal problem. It was shown that for a finite-volume discretization of the 2D viscous Burgers' equation on an unstructured mesh, the error estimate can be expressed as in Eq. 3.2 for visual comparison. For a structured grid (Table 3.1), the same applies, and 2nd order accuracy was obtained for different grid densities32. Other studies suggest improving finite-volume diffusive fluxes through better reconstruction [Sejekan and Ollivier-Gooch]33, where they believed the inaccuracy originates as an error in the flux integral. The aim is to compute the gradient and flux more accurately at the cell boundaries and hence obtain a better flux integral for a slight increase in computational cost.
3.4.6.1 Case Study – Domain Discretization Error for the Transitional Flow over a Sharp Cone
An example of using the Richardson extrapolation procedure as an error estimator was presented by [Roy and Blottner]34. They examined the hypersonic, transitional flow over a sharp cone. The system response quantity was the heat flux distribution along the surface. The surface heat flux is shown versus the axial coordinate in Figure 3.14-(a) for three systematically-refined mesh levels: fine (160×160 cells), medium (80×80 cells), and coarse (40×40 cells). Also shown are Richardson extrapolation results found from the fine and medium mesh solutions. The sharp rise in heat flux at x = 0.5 m is due to the specification of the location for transition from laminar to turbulent flow. In Figure 3.14-(b), the Richardson extrapolation results are used to estimate the discretization error
31 Gary Kai Kin Yan, Carl Ollivier-Gooch, "Discretization Error Estimation by the Error Transport Equation on Unstructured Meshes Applications to Viscous Flows", 54th AIAA Aerospace Sciences Meeting.
32 I. Sadrehaghighi, "Verification & Validation", Presented in CDI Marine, March 2011.
33 Chandan B. Sejekan, Carl F. Ollivier-Gooch, "Improving Finite-Volume Diffusive Fluxes Through Better Reconstruction", Computers & Fluids, August 2016.
34 Roy, C. J. and Blottner, F. G. (2003), "Methodology for Turbulence Model Validation: Application to Hypersonic Transitional Flows," Journal of Spacecraft and Rockets, Vol. 40, No. 3, pp. 313-324.


in each of the numerical solutions. Neglecting the immediate vicinity of the transition location, the maximum estimated discretization errors are approximately 8%, 2%, and 0.5% for the coarse, medium, and fine meshes, respectively. The solutions thus appear to be converging as h → 0. Furthermore, these estimated errors display the expected h^p reduction for these formally second-order accurate computations. In the turbulent region, the maximum errors are also converging at the expected rate, giving error estimates of approximately 4%, 1% and 0.25%.

Figure 3.14 (a) Exact error and (b) Estimated error for the viscous Burgers' equation (Courtesy of Yan and Ollivier-Gooch)
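The Richardson extrapolation procedure used in the case study above can be sketched in a few lines. The response values and the refinement ratio below are placeholders, not the sharp-cone data; the 1.25 safety factor in the Grid Convergence Index follows common practice (Roache) and is an assumption, not a value taken from the reference.

```python
# Sketch of Richardson extrapolation on three systematically refined meshes
# (refinement ratio r): estimate the observed order of accuracy p, an extrapolated
# "exact" value, and a relative discretization-error estimate for each mesh.
import math

f_fine, f_medium, f_coarse = 1.0210, 1.0245, 1.0390   # illustrative response values
r = 2.0                                                # grid refinement ratio

p = math.log(abs(f_coarse - f_medium) / abs(f_medium - f_fine)) / math.log(r)
f_exact_est = f_fine + (f_fine - f_medium) / (r**p - 1.0)   # Richardson extrapolation

for name, f in (("fine", f_fine), ("medium", f_medium), ("coarse", f_coarse)):
    err = abs(f - f_exact_est) / abs(f_exact_est)
    print(f"{name:6s}: estimated relative discretization error = {100 * err:.2f}%")

# Grid Convergence Index on the fine mesh with a safety factor of 1.25
gci_fine = 1.25 * abs(f_fine - f_medium) / abs(f_fine) / (r**p - 1.0)
print("observed order p =", round(p, 2), " GCI_fine =", f"{100 * gci_fine:.2f}%")
```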

3.5 Temporal Discretization Errors
Temporal discretization errors mainly affect transient simulations. However, some codes use a time marching method also for steady simulations, and then a temporal discretization error might affect the final steady solution slightly. The discretization in time can be done with 1st or 2nd order schemes or with a Runge-Kutta method, which is more accurate and saves memory. Some codes can adapt the time-step, but often it is necessary to prescribe a time-step in advance. Think of the time-step as your grid in time, and make sure that the grid resolution in time is fine enough to resolve the highest frequencies. It is obvious that the CFL number has a bearing on temporal accuracy. In analogy with the spatial accuracy arguments, it is easy to see that a very small physical time step will give very time-accurate results, though it will take more computing time. Too high a time step not only causes a loss in temporal accuracy but can also affect stability. A compromise is therefore needed from a practical engineering standpoint.

Figure 3.15 Temporal Discretization Criteria

To avoid problems with temporal discretization errors, the following should


be considered. Try to use at least the same order in your temporal discretization as in your spatial one: for example, Crank-Nicolson is 2nd order accurate and suits a 2nd order spatial scheme, while Euler Implicit is only 1st order accurate; accordingly, a smaller time step for the Euler Implicit scheme is not of much help. An easier way is to look at "temporal refinement" from the same perspective as "spatial refinement" studies: on the same grid, Crank-Nicolson (CN) has a slope of 2 as dt is reduced, while Euler Implicit (EI) has a slope of 1. This means that to achieve the same error levels as CN, dt must be lowered substantially for EI. Do a physical estimation of the typical frequencies in time of the phenomena that you are interested in, and select a time-step which is fine enough to resolve these frequencies well. After the simulation, also look at the frequencies captured and make sure that they are well resolved by the chosen time-step. For new applications, try a finer

Figure 3.16 (a) Surface Heat Flux and (b) Relative Discretization Error for the transitional flow over a sharp cone


time-step to ensure that your solution is fairly grid independent in time. This dictates a balancing act between the spatial and temporal discretizations, as discussed earlier, which is related directly to the CFL number. It is customary to choose a CFL number of 1 or less to be consistent, as depicted in what follows. For example, the 1D wave equation could be written as:

$$
\frac{\partial u}{\partial t} + \alpha \frac{\partial u}{\partial x} = 0
\;\;\Rightarrow\;\;
u_j^{n+1} = u_j^n - \underbrace{\frac{\alpha \Delta t}{\Delta x}}_{CFL} \left( u_j^n - u_{j-1}^n \right)
$$

The effect of different CFL numbers is displayed in Figure 3.17.
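A minimal sketch of this experiment is given below: the upwind update written above is run at three CFL numbers on a Gaussian pulse. A CFL at or below 1 remains stable (with CFL = 1 reproducing the exact translation), while a CFL above 1 grows without bound. The grid size, step count and initial condition are arbitrary illustrative choices.

```python
# Sketch: first-order upwind update of the 1D wave equation at several CFL numbers.
# CFL <= 1 is stable; CFL > 1 amplifies the solution, illustrating the stability limit.
import numpy as np

def advect(cfl, nx=200, nsteps=100):
    x = np.linspace(0.0, 1.0, nx, endpoint=False)
    u = np.exp(-200.0 * (x - 0.25) ** 2)           # Gaussian pulse, periodic domain
    for _ in range(nsteps):
        # u_j^{n+1} = u_j^n - CFL * (u_j^n - u_{j-1}^n)
        u = u - cfl * (u - np.roll(u, 1))
    return np.max(np.abs(u))

for cfl in (0.5, 1.0, 1.2):
    print(f"CFL = {cfl:3.1f}   max|u| after 100 steps = {advect(cfl):.3e}")
```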

3.6 Iterative Convergence Errors
Judging when a CFD simulation is converged is not always easy; different codes and different applications behave very differently. Before we pay attention to the convergence issue, it is prudent to establish what the proper convergence level for the solution is. According to Wikipedia, in computational mathematics an iterative method is a mathematical procedure that generates a sequence of improving approximate solutions for a class of problems. A specific implementation of an iterative method, including the termination criteria, is an algorithm of the iterative method. An iterative method is called convergent if the corresponding sequence converges for given initial approximations. A mathematically rigorous convergence analysis of an iterative method is usually performed; however, heuristic-based iterative methods are also common. In problems of finding the root of an equation (or a solution of a system of equations), an iterative method uses an initial guess to generate successive approximations to a solution. For a pure aero-simulation on a fairly coarse grid, convergence is easy to judge, but for more complex simulations involving resolved boundary layers, heat transfer, combustion etc., convergence can be very tricky. Aside from looking at residuals, one should always also look at how global parameters like static pressure distributions, total pressure losses, skin friction, heat transfer, etc. change in time. To summarize, at convergence

Figure 3.17 Effect of CFL Number on Convergence of 1D Wave Equation


the following should be satisfied35:

• All discrete conservation equations (momentum, energy, etc.) are obeyed in all cells to a specified tolerance, OR the solution no longer changes with subsequent iterations.
• Overall mass, momentum, energy, and scalar balances are achieved.

Simply put, for a time-marching or time-accurate strategy, this involves examining whether the final time has been reached with proper convergence at each time step. For a space-marching strategy, this involves examining whether the end of the marching segment has been reached with proper convergence at each marching step.
3.6.1 Monitoring Convergence using Residual History
Generally, a decrease in residuals by three orders of magnitude indicates at least qualitative convergence. At this point, the major flow features should be established. The scaled energy residual should decrease to 10-6 (for the pressure-based solver), and the scaled species residual may need to decrease to 10-5 to achieve species balance.
3.6.2 Monitoring Quantitative Convergence
Two important aspects of quantitative monitoring, sketched in the example below, are:

• Monitor other relevant key variables/physical quantities for confirmation.
• Ensure that overall mass/heat/species conservation is satisfied.
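The following minimal sketch encodes these two checks: a three-orders-of-magnitude drop in the scaled residual and a net flux imbalance below 1% of the smallest boundary flux. The residual history and flux values are placeholder numbers; in practice they would come from the solver's monitor output.

```python
# Minimal sketch of two quantitative convergence checks described above.
def residuals_converged(history, orders=3.0):
    """True if the scaled residual has dropped by 'orders' orders of magnitude."""
    return history[0] / history[-1] >= 10.0 ** orders

def fluxes_balanced(inflow, outflow, tol=0.01):
    """True if the net imbalance is below tol * smallest boundary flux."""
    imbalance = abs(inflow - outflow)
    return imbalance <= tol * min(abs(inflow), abs(outflow))

residual_history = [1.0e-1, 3.0e-3, 8.0e-5, 9.0e-6]   # scaled residuals per iteration
mass_in, mass_out = 2.503, 2.498                       # kg/s through the boundaries

print("residuals converged:", residuals_converged(residual_history))
print("mass balance OK    :", fluxes_balanced(mass_in, mass_out))
```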

In addition to residuals, you can also monitor lift, drag and moment coefficients and relevant variables or functions (e.g., surface integrals) at a boundary or any defined surface. Furthermore, in addition to monitoring residual and variable histories, you should also check the overall heat and mass balances. The net flux imbalance (shown in the GUI as Net Results) should be less than 1% of the smallest flux through the domain boundary. If the solution monitors indicate that the solution is converged, but the solution is still changing or has a large mass/heat imbalance, this clearly indicates the solution is not yet converged. In this case, you need to reduce the values of the Convergence Criterion or disable Check Convergence in the Residual Monitors panel and continue iterations until the solution converges.
3.6.3 Norms of Convergence Error
A more attractive way to estimate the iterative error is to use norms of the change in the solution from one iteration to the next36. The iterative error is related to the non-linearity of the system of partial differential equations solved in CFD. There are several sources of non-linearity in the RANS equations:

• The convective terms. The usual linearization procedures are Picard or Newton methods, which imply an iterative solution.
• The turbulence closure. For example, one- and two-equation eddy-viscosity models have non-linear convective terms and non-linear production and dissipation terms. Also, the turbulence model equations are often solved segregated from the continuity and momentum equations.

35 Mike Kuron, M.S.M.E., Project Manager at CAE Associates.
36 L. Eça, M. Hoekstra, "On the Influence of the Iterative Error in the Numerical Uncertainty of Ship Viscous Flow Calculations", 26th Symposium on Naval Hydrodynamics, Rome, Italy, 17-22 September 2006.


Furthermore, the linear system of algebraic equations obtained from the discretization of the linearized partial differential equations is rarely solved with a direct method. Therefore, the flow solution includes an extra iterative cycle corresponding to the method applied in the solution of the linear systems of equations. In most flow solvers, no clear distinction is made between the various iterative cycles. Therefore, in the estimation of the iterative error it is important to point out the meaning of one iteration of the solution procedure.

$$
L_\infty(\varphi) = \max_{1 \le i \le N_P} \left| \Delta\varphi_i \right|,
\qquad
L_1(\varphi) = \frac{\sum_{i=1}^{N_P} \left| \Delta\varphi_i \right|}{N_P},
\qquad
L_2(\varphi) = \sqrt{\frac{\sum_{i=1}^{N_P} \left( \Delta\varphi_i \right)^2}{N_P}}
\qquad \textbf{Eq. 3.3}
$$

Where NP stands for the total number of nodes of a given grid and ∆ϕ for the local change of the flow quantity ϕ. Two options were used for ∆ϕ: the variable change between consecutive iterations, ∆ϕd = ϕn - ϕn-1, and the normalized residual of the discretized equations, ∆ϕr. In using these options, there are some important practical details. The difference in ϕ between iterations, ∆ϕd, is readily evaluated. However, it may be affected by the use of under-relaxation in the calculation procedure. If implicit under-relaxation schemes are applied, as for example local time-stepping, ∆ϕd will reflect its influence correctly. On the other hand, explicit under-relaxation must be handled carefully; ∆ϕd should be calculated before the under-relaxation is applied, otherwise the values of ∆ϕd will become artificially small. The relation between the residual of the discretized equations and the flow quantities depends on the method adopted. Nevertheless, in general, the normalized residual, ∆ϕr, is equivalent to the differences in the solution of a Jacobi iteration for the system of equations of a given iteration. The term normalized means that the main diagonal of the system is scaled to one in order to obtain a right-hand side which represents a change in the dependent variable. Then ∆ϕr is also a measure of the differences between consecutive iterations. The values of L1, L2 and L∞ obtained in any iteration n that satisfies the selected convergence criteria may be used as iterative error estimators. However, there is no guarantee that these values bound the iterative error, especially when the rate of convergence is small.
3.6.4 Case Study – 2D Flow Over a Hill
The calculations of the turbulent flow around the two-dimensional hill, on a 241×241 grid with a Reynolds number of 6×10⁴, were performed with the eddy viscosity one-equation model of Spalart & Allmaras. In each of the seven grids tested, the calculations were started from scratch, copying the inlet profiles to the complete flow field. In this test case, we have chosen to monitor the behavior of the two Cartesian velocity components, U1 and U2, and the pressure coefficient, Cp. All the quantities presented are dimensionless, using the hill height and the mean centerline velocity as the reference values. The reference pressure to compute Cp is the pressure at the outlet of the computational domain, i.e. Cp = 0 at the outlet. The convergence criterion is based on the maximum difference between consecutive iterations, L∞(∆ϕd). Figure 3.18 presents the iterative error of U1, based on the solution converged to machine accuracy, in the finest grid. With the data shown it is easy to assess the quality of the iterative error estimators based on the last iteration performed. For the three levels of et plotted, the values of L2(∆(U1)d) of the last iteration performed are 10^-4.3, 10^-6.6 and 10^-9.1. Iterative error estimations based only on the values of the last iteration performed are not reliable37.
37 L. Eça, M. Hoekstra, "On the Influence of the Iterative Error in the Numerical Uncertainty of Ship Viscous Flow Calculations", 26th Symposium on Naval Hydrodynamics, Rome, Italy, 17-22 September 2006.


The L∞ norm is best suited for the iterative error estimation. The L2 norm is clearly worse, but still a better choice than the L1 norm. The results based on ∆ϕd are globally the most consistent. The most appropriate norm with which to perform iterative error estimation is thus the L∞ norm. The L2 and L1 norms do not lead to iterative error estimates which are representative of the complete computational domain. In most of the tested cases, the error estimators based on these two norms do not bound the iterative error obtained from the difference with the solution converged to machine accuracy. The locations where such estimates fail to bound the iterative error may cover a significant part of the computational domain. Estimates of the iterative error based only on the results of the last iteration, be it the change in the variables or the normalized residuals, are unreliable38. In most cases, a major part of the computational domain exhibits iterative errors larger than these last-iteration values. Indeed, the maximum values of the iterative error may be more than one order of magnitude larger than the differences between consecutive iterations or the normalized residuals of the last iteration.

Figure 3.18 Estimated Iteration Error of U1 for Different Levels of Tolerance Criteria et
38 L. Eça, M. Hoekstra, "On the Influence of the Iterative Error in the Numerical Uncertainty of Ship Viscous Flow Calculations", 26th Symposium on Naval Hydrodynamics, Rome, Italy, 17-22 September 2006.


The extrapolation to an infinite number of iterations significantly improves the performance of the iterative error estimation. A least squares fit to a geometric progression seems to be a good option to make reliable estimates of the iterative error.
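The two ingredients of that estimate, the norms of Eq. 3.3 and the geometric-progression extrapolation, are sketched below. The convergence history is synthetic, and the fit simply assumes the change per iteration decays as c·ρⁿ so the remaining iterative error can be summed as a geometric tail; this is an illustration of the idea, not the procedure of the cited reference.

```python
# Sketch of the iterative-error norms of Eq. 3.3 and of extrapolating the
# convergence history with a least-squares fit to a geometric progression.
import numpy as np

def norms(dphi):
    """L_inf, L1 and L2 norms of the change dphi over all NP nodes (Eq. 3.3)."""
    NP = dphi.size
    return (np.max(np.abs(dphi)),
            np.sum(np.abs(dphi)) / NP,
            np.sqrt(np.sum(dphi ** 2) / NP))

# Synthetic history of L_inf(dphi) over the last iterations: roughly geometric decay
n = np.arange(20)
rng = np.random.default_rng(0)
linf_history = 1.0e-3 * 0.8 ** n * (1.0 + 0.05 * rng.standard_normal(20))

# Least-squares fit of log(L_inf) = log(c) + n*log(rho), i.e. a geometric progression
slope, intercept = np.polyfit(n, np.log(np.abs(linf_history)), 1)
rho, c = np.exp(slope), np.exp(intercept)

# Estimated remaining iterative error if the run stopped now: sum of the geometric tail
err_estimate = c * rho ** n[-1] * rho / (1.0 - rho)
print(f"convergence ratio rho = {rho:.3f}, estimated remaining error = {err_estimate:.2e}")
```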

3.7 Computer Round-off Errors

Any computed solution may be affected by rounding to a finite number of digits39. In some

calculations, the magnitude of round-off errors is proportional to the number of grid points in the domain. In these cases, refining the grid may decrease the truncation error but increase the round-off error. When using single precision, care needs to be taken to avoid round-off errors. Inviscid Euler simulations and simulations using wall-function meshes can most often be performed in single precision. For well resolved boundary layers with y+ close to 1, it is often necessary to use double precision. If using double precision for very fine mesh resolutions, make sure that you also create the mesh in double precision and not just run the solver in double precision. Sometimes a single precision solver converges more slowly than a double precision solver due to numerical errors caused by round-off. When using advanced physical models like combustion, free-surface simulations, sprays and transient simulations with rapid mesh motion, it is also often necessary to use double precision. A notorious example is the fate of the Ariane rocket launched on June 4, 1996 (European Space Agency 1996). In the 37th second of flight, the inertial reference system attempted to convert a 64-bit floating point number to a 16-bit number, but instead triggered an overflow error which was interpreted by the guidance system as flight data, causing the rocket to veer off course and be destroyed40. Round-off error in a numerical method is error that is caused by using a discrete number of significant digits to represent real numbers on a computer. Since computers can retain a large number of digits in a computation, round-off error is problematic only when the approximation requires that the computer subtract two numbers that are nearly identical. This is exactly what happens if we apply an approximation to intervals that are too small. Thus, the effort to decrease truncation error can have the unintended consequence of introducing significant round-off error.

3.8 Truncation Errors

Truncation error represents the difference between the PDE and the FDE and is represented by order notation, as previously discussed. It is inversely related to the order of accuracy of the equations and is an extremely important criterion for the accuracy of the discretized equation, as it is directly related to stability considerations. Often, truncation error also includes discretization error, which is the error that arises from taking a finite number of steps in a computation to approximate an infinite process. The truncation error associated with the 1-D heat equation can be expanded as below (Eq. 3.4). It is obvious that the better the FDE approximation, the smaller the truncation error. But like many other things in CFD, there should be a balance between the FDE approximation and the truncation error. For most practical applications, 2nd order accuracy is sufficient41. Alternatively, truncation error in a numerical method is error that is caused by using simple approximations to represent exact mathematical formulas. The only way to completely avoid truncation error is to use exact calculations. However, truncation error can be reduced by applying the same approximation to a larger number of smaller intervals or by switching to a better approximation. Analysis of truncation error is the single most important source of information about the theoretical characteristics that distinguish better methods from poorer ones. With a combination

39 Anderson, Dale A., Tannehill, John C., Pletcher, Richard H., "Computational Fluid Mechanics and Heat Transfer", Hemisphere Publishing Corporation, 1984.
40 Weisstein, Eric W., "Round-off Error", From MathWorld - A Wolfram Web Resource.
41 Gerald Recktenwald, 2006.


of theoretical analysis and numerical experiments, it is possible to estimate truncation error accurately.

$$\underbrace{\frac{\partial u}{\partial t}-\alpha\frac{\partial^{2}u}{\partial x^{2}}}_{\text{PDE}}
=\underbrace{\frac{u_{j}^{n+1}-u_{j}^{n}}{\Delta t}-\frac{\alpha}{(\Delta x)^{2}}\left(u_{j+1}^{n}-2u_{j}^{n}+u_{j-1}^{n}\right)}_{\text{FDE}}
+\underbrace{\left[\left\{-\frac{\partial^{2}u}{\partial t^{2}}\frac{\Delta t}{2}\right\}+\left\{\alpha\frac{\partial^{4}u}{\partial x^{4}}\frac{(\Delta x)^{2}}{12}\right\}+\ldots\right]}_{\text{T.E.}}$$
Eq. 3.4

Practitioners of numerical approximation are most concerned with truncation error, but they also try to restrict their efforts at decreasing truncation error to improvements that do not introduce significant round-off error. Here, we consider only truncation error. We seek information about error on both a local and global scale. Local truncation error is the amount of truncation error that occurs in one step of a numerical approximation. Global truncation error is the amount of truncation error that accumulates in the use of a numerical approximation to solve a problem.
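The interplay between truncation error and round-off error can be illustrated with a simple numerical experiment (a sketch, not tied to any particular CFD code): a central-difference derivative whose error first drops at the expected second order as the interval shrinks and then rises again once subtractive cancellation dominates.

```python
import numpy as np

# Central-difference approximation of f'(x): truncation error ~ h**2,
# but round-off error grows roughly as machine_eps / h for very small h.
f, x, dfdx_exact = np.sin, 1.0, np.cos(1.0)

for h in [1e-1, 1e-3, 1e-5, 1e-8, 1e-11]:
    dfdx = (f(x + h) - f(x - h)) / (2.0 * h)
    print(f"h = {h:9.0e}   error = {abs(dfdx - dfdx_exact):.3e}")
# The error decreases at second order with h, then increases again once
# round-off (cancellation) overtakes the truncation error.
```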

3.9 Code Errors

These are errors related to bugs in the code used or to mistakes made by the programmer. A general methodology called the Method of Manufactured Solutions (MMS) has been proposed to address the issue. Since a physically realistic solution is not needed, code verification being a purely mathematical assessment activity rather than a physical one, one can suppose an arbitrary, analytic solution field. Then, through a mathematical derivation in which symbolic mathematical systems can help, the unknowns in the differential terms are replaced with the assumed solution. The obtained result, which will generally be different from zero, corresponds to a source term field for the original governing equations and a set of boundary conditions. This source term field has a complicated but analytical expression and can be set in the simulation code. With this setup the simulation is launched. As the exact solution is known, it is possible to compute an error with an appropriate norm (typically the L2 norm), namely the difference between the numerical result and the exact solution.
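A minimal sketch of the idea, assuming a 1-D Poisson/diffusion model problem with the manufactured solution u = sin(πx) (the solver and grid sizes are purely illustrative, not taken from any particular code), is shown below; a correctly implemented second-order discretization should return an observed order of accuracy close to 2.

```python
import numpy as np

def solve_poisson_mms(n):
    """Second-order finite-difference solve of -u'' = s on (0,1), u(0)=u(1)=0,
    with manufactured solution u_exact = sin(pi*x), hence s = pi**2*sin(pi*x).
    Returns the L2 norm of the discretization error (illustrative sketch)."""
    x = np.linspace(0.0, 1.0, n + 1)
    h = 1.0 / n
    s = np.pi ** 2 * np.sin(np.pi * x[1:-1])          # manufactured source term
    # tridiagonal operator for -u'' with central differences
    A = (np.diag(2.0 * np.ones(n - 1))
         - np.diag(np.ones(n - 2), 1)
         - np.diag(np.ones(n - 2), -1)) / h ** 2
    u = np.linalg.solve(A, s)
    err = u - np.sin(np.pi * x[1:-1])
    return np.sqrt(h * np.sum(err ** 2))

e_coarse, e_fine = solve_poisson_mms(32), solve_poisson_mms(64)
print("observed order of accuracy:", np.log2(e_coarse / e_fine))   # ~2 if correct
```

If a coding bug degrades the formal order of accuracy, the observed order computed from two such grid levels falls below the expected value, which is precisely the signal the Method of Manufactured Solutions is designed to expose.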

3.10 Benchmarking & Inter-Code Issues
The CFD benchmarking project is a large collection of CFD benchmark configurations known from the literature42. It stems from the need to compare the results of a CFD software not only by the "picture norm" (i.e. looking at the produced pictures and saying, "Oh, that software works quite accurately"), but to compare hard numerical values against reference values commonly accepted by the CFD community. The benchmark problems here are mainly designed in 2D and aim to give a deeper understanding of how mathematical methods work in practice. This will hopefully also initiate further discussions about the reasonability of one or the other method. Of course, references to the literature are given wherever possible. There is also a section with "mini-benchmarks", which contains very basic tests (sometimes even with analytical results). This section should serve as a reference to give developers of CFD codes a base for testing different solver components for correctness. A common issue that arises in CFD is the validation and testing of the code to be used for a computation. According to CFD Online, some of the better known cases for validation purposes are:

42 Lehrstuhl III, Angewandte Mathematik und Numerik, Technische Universität Dortmund, Germany.


➢ 1D test cases
• Shock Tube Problem
➢ 2D test cases
• 2D Vortex in Isentropic Flow
• 2D Riemann Problem
• 2D Laminar/Turbulent Driven Square Cavity Flow
• Circular Advection
• Explosion Test in 2D
• Lid-driven Cavity Problem
• Jeffery-Hamel Flow
• Flow Over Backward Facing Step (Laminar – Turbulent)
• Flow Around a Circular Cylinder
• Flow Across a Square Cylinder
• NACA 0012 Airfoil
• RAE 2822 Airfoil
• Ringleb Flow
• Scramjet Intake
• Suddhoo-Hall Airfoil
• Turbulent Flat-Plate
• Viscous Diffusion of Multiple Vortex System
• Williams Airfoil
• 2D Ramp in Channel Problem
• 2D Single Mode Rayleigh-Taylor Instability
• 2D Single Mode Richtmyer-Meshkov Instability
• 2D Mach 3 Wind Tunnel With a Step
• Gresho Vortex
➢ 3D test cases
• Ahmed Body
• Flow in the 180 Degree U-Bend Square Duct
• DARPA SUBOFF Model
• Hypersonic Blunt Body Flow
• ONERA M6 Wing
• Turbomachinery: Eckardt Centrifugal Compressor, NASA Rotor 37 for Axial Rotors, NASA Rotor 67 for Axial Fans
• 3D Single Mode Rayleigh-Taylor Instability
• 3D Single Mode Richtmyer-Meshkov Instability
• Free-Surface Piercing NACA 0024 Hydrofoil
➢ Transition test cases
• 2D Cascade


3.10.1 Case Study 1 – Results of M6 Wing using NASA Codes on the Same Grid
As a 3D example, consider the M6 wing for comparing aerodynamic quantities using NASA codes. The flow conditions are set to M = 0.82, α = 5 degrees, and Re = 10⁷, with an algebraic turbulence model used. Table 3.2 shows a comparison of the results.

Table 3.2 NASA Code Comparisons for Surface Forces in M6 Wing

Code      | Lift Coefficient | Drag Coefficient | Normal Force Coefficient
CFL3D     | 0.5256           | 0.04097          | 0.5272
TLNS3D    | 0.5254           | 0.04290          | 0.5271
ENSAERO   | 0.5212           | 0.04230          | 0.5230
ADPAC     | 0.5217           | 0.03453          | 0.5227
OVERFLOW  | 0.5267           | 0.04422          | 0.5285

3.10.2 Case Study 2 - Grid Convergence for 3D Benchmark Turbulent Flows
Grid convergence studies are performed by [Diskin et al.]43 to establish reference solutions for benchmark 3D turbulent flows in support of the ongoing turbulence model verification and validation effort at NASA. The benchmark cases are a subsonic flow around a hemisphere cylinder and a transonic flow around the ONERA M6 wing with a sharp trailing edge. The study applies widely used CFD codes developed and supported at the NASA Langley Research Center, namely FUN3D, USM3D, and CFL3D. A description of the codes is available in44. Reference steady-state solutions are computed for the RANS equations in conjunction with the Spalart-Allmaras turbulence model on families of consistently-refined grids composed of different types of cells. Coarse-to-fine and code-to-code solution variation is described in detail. For further details, readers should consult the work by45.
3.10.2.1 Subsonic Flow around a Hemisphere Cylinder
Five grid families are generated for this study. Unstructured grids of families 1 to 4 have triangular faces on the hemisphere surface and no polar singularity. Each family has four levels of nested grids; L1 is the finest grid level, L2 is the second finest grid level, etc. Statistics of grids from families 2 (tet), 4 (prism/hex), and 5 (structure) are shown in Table 3.3. The family 5 (structure) grids have a polar singularity at the axis attached to the apex of the hemisphere, i.e., along this polar axis, hexahedral cells degenerate into prismatic cells. Unstructured grids corresponding to the same level have the same distribution of grid nodes. In comparison with unstructured grids at the same level, family 5 (structure) grids have the same number of surface elements on the hemisphere surface, the same distribution of nodes on the cylinder surface, and more nodes on the hemisphere surface. All the L1 grids have a near-surface normal spacing approximately corresponding to y+ = 0.5. FUN3D solutions are computed on grids of families 2 (tet) and 4 (prism/hex); SFE solutions are computed on family 2 (tet) grids,

43 B. Diskin, W. K. Anderson, M. J. Pandya, C. L. Rumsey, J. L. Thomas, Y. Liu, and H. Nishikawa, "Grid Convergence for Three Dimensional Benchmark Turbulent Flows", AIAA Aerospace Sciences Meeting, 2018.
44 See Previous.
45 See Previous.


USM3D solutions are computed on grids of families 2 (tet) and 5 (structure), and CFL3D solutions are computed on family 5 (structure) grids.

Table 3.3 Statistics of the Four Finest Grids for the Hemisphere Cylinder Grid Families (Courtesy of [Diskin et al.])

3.10.2.2 Geometry, Flow Parameters, and Boundary Conditions
The geometry is taken from the experimental study reported by [Tsieh]46. In the experiment, the radius of the hemisphere was 0.5 in., the body length was 10 in., and the unit Reynolds number was 4.2x10⁶ per foot. Thus, in the computational domain with the unit length taken as 1 in., the hemisphere radius is 0.5, the cylinder length is 9.5, and the Reynolds number is Re = 3.5x10⁵ per unit length. The reference solutions have been computed at the following flow conditions: the reference Mach number Mref = 0.6, angles of attack of 0, 5, 10, 15, and 19 degrees, and the reference temperature Tref = 540 degrees R. Only solutions corresponding to the 19-degree angle of attack are presented here. The origin of the coordinate system is located at the apex of the hemisphere. The positive x direction is the streamwise direction, collinear with the axis of the hemisphere and cylinder. Figure 3.19 sketches the layout of boundary conditions and shows the global view of a computational grid

Figure 3.19 Global View of Grid and Boundary Conditions (Courtesy of [Diskin et al.])

46 Tsieh, T., "An Investigation of Separated Flow About a Hemisphere Cylinder at 0 to 19 Degrees Incidence in the Mach Number Range from 0.6 to 1.5", AEDC-TR-76-112, 1976.


with half-plane symmetry. The downstream computational boundary is located at the back of the cylinder, x = 10. The outflow conditions specified at the downstream boundary are constant-pressure conditions corresponding to P = Pref = 1. The far-field boundary is a hemisphere with a radius of 100 units centered at x = 10, y = 0, z = 0.
3.10.2.3 Results for Hemisphere Cylinder
For solution visualization, Figure 3.20 presents the FUN3D solution computed on the prism/hex L1 grid. The pressure contours and streamlines are shown in two planes corresponding to y = 0 and x = 6.0. The pressure is non-dimensionalized by ρref a²ref, where ρref and aref are the dimensional freestream density and speed of sound, respectively. In the symmetry plane corresponding to y = 0, the cross-stream separation is characterized by downward flow velocity. The separation occurs behind the hemisphere-cylinder junction and continues for the entire cylinder length. A minimum pressure is observed on the leeside, upstream of the hemisphere-cylinder junction. A large primal vortex and a smaller secondary vortex are shown in the crossflow plane corresponding to x = 6.0. The separation locations of these primal and secondary vortices are similar to those documented in the experiment. An off-body vortex is seen in the shear layer of the primal vortex, outboard of the secondary one.

Figure 3.20 Global View of Hemisphere Cylinder Pressure Contours using the L1 grid at surfaces y = 0 (left) and x = 6 (right) (Courtesy of [Diskin et al.])

3.10.2.4 Forces and Pitching Moment
Grid convergence plots of the lift, total drag (including pressure and viscous components), and pitching moment coefficients and the maximum eddy viscosity are shown in Figure 3.21. The value of the characteristic grid spacing, h, is computed as h = N^(-1/3), where N is the number of degrees of freedom (cells for USM3D and CFL3D, nodes for FUNFV47 and SFE). The aerodynamic coefficients computed with different codes on different grid families are generally converging to the same limit with grid refinement. Convergence of the maximum eddy viscosity is less clear, mainly because of the disagreement between limit projections from FUNFV (prism/hex) solutions and other solutions, even though the SFE and FUNFV (prism/hex) solutions agree well on the finest L1 grids. Overall code-to-code aerodynamic coefficient variation from the L4 grids to the L1 grids is up to 20%. In this

47 Two different discretizations available in FUN3D are employed: the baseline finite-volume discretization (FUNFV) and a recently implemented stabilized finite-element discretization (SFE) based on a Streamline Upwind Petrov-Galerkin formulation.


estimate and in the rest of the paper, relative variation is computed with respect to the middle of the variation range. Extrapolation to the infinite-grid limit is problematic because no reliable order of convergence can be established. No solution appears to converge uniformly in all quantities. Three solutions, USM3D (structure), FUNFV (prism/hex) and SFE, converge monotonically. Considering lift, USM3D (structure) solutions show less than first-order convergence, i.e., the lift approaches the limit from above with a concave shape. The FUNFV (prism/hex) and SFE lift curves approach the limit from above with convex shapes, indicating a convergence order that is higher than first order. The FUNFV (tet) lift appears to converge with first order on the three finer grids. The USM3D (tet) lift converges from above and changes the curve shape from concave to convex. Considering pressure drag convergence, the FUNFV (prism/hex) and SFE convergence curves approach the limit with convex shapes from above, but intersect. Lacking an exact solution, we use a quantitative characterization of the observed solution variation to evaluate accuracy. Variation of the aerodynamic coefficients computed on the L1 grids is described in Table 3.4. The largest relative difference among all solutions is observed for the pitching moment and does not exceed 4.4%. Accuracy of aerodynamic coefficients improves in proportion to the degrees of freedom used in CFD computations. This property is the foundation of all grid refinement studies. It also justifies the expectation of accuracy benefits from tetrahedral-grid cell-centered formulations that provide more degrees of freedom on grids of the same level. The USM3D solutions use about six times more degrees of freedom on grids of family 2 (tet) than other solutions on grids of the same level. Because the grid convergence shown in Table 3.4 is not regular, quantitative assessments of accuracy improvements due to additional degrees of freedom are difficult and imprecise. Qualitatively, the aerodynamic coefficients computed by USM3D (tet) on the L2 grid are within the variation range of the L1 solutions. Looking at the grid convergence on the three finer grids, the maximum and minimum values of integrated aerodynamic quantities have generally been exhibited by the CFL3D solutions and the FUNFV (tet) solutions. (The only exception is that, for the viscous drag coefficient, the minimum is exhibited by the SFE solutions.) Relative variation among the core-group L1 solutions is also shown in Table 3.4. The deviations of the CFL3D solutions from the core-group solutions may be attributed to the thin-layer approximation. The abnormalities in the FUNFV (tet) solutions observed on the current grids are harder to explain. In the limit of grid refinement, all FUNFV and SFE solutions are expected to converge to the same "infinite-grid" solution. On the current grids, nonphysical oscillatory solution modes resembling checker-board instabilities were observed in the FUNFV (tet) solutions with the default MUSCL scheme coefficient, κ = 0.0. The FUNFV solutions computed on grids of other families are smooth. Note that the default value of the MUSCL scheme coefficient on non-tetrahedral grids is κ = 0.5. In this study, an increased coefficient of κ = 0.75 is used for FUNFV (tet) solutions. Solutions with κ = 0.75 do not exhibit nonphysical oscillations, but appear to be somewhat less accurate.
Table 3.4 Hemisphere Cylinder: Variation of Aerodynamic Coefficients on L1 Grids – (Courtesy of [Diskin et al.])
Although not shown, FUNFV solutions with κ = 0.0 were computed on tetrahedral grids by using the


approximate mapping discretization method for inviscid fluxes48-49. Approximate-mapping solutions do not exhibit nonphysical oscillations and provide aerodynamic coefficients well within the core-group variation range.
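For reference, the standard procedure for turning such grid-convergence data into an observed order of accuracy and an extrapolated infinite-grid value is sketched below, under the assumptions of a constant refinement ratio and monotone convergence (the numbers are illustrative and are not taken from [Diskin et al.]); as noted above, the procedure breaks down when no reliable order can be established.

```python
import numpy as np

# Observed order of convergence for a quantity f computed on three
# consistently refined grids (illustrative values only).
N = np.array([1.0e6, 8.0e6, 6.4e7])          # degrees of freedom, coarse -> fine
f = np.array([0.5300, 0.5268, 0.5260])       # e.g. a lift coefficient

h = N ** (-1.0 / 3.0)                        # characteristic grid spacing
r = h[0] / h[1]                              # refinement ratio (assumed constant)
p = np.log((f[0] - f[1]) / (f[1] - f[2])) / np.log(r)
f_extrap = f[2] + (f[2] - f[1]) / (r ** p - 1)   # Richardson-type limit estimate
print(f"observed order p = {p:.2f}, extrapolated value = {f_extrap:.5f}")
```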

Figure 3.21 Grid Convergence of Aerodynamic Forces for Hemisphere Cylinder (Courtesy of [Diskin et al.])

3.10.2.5 Fine-Grid Surface Pressure, Skin Friction, and Off-Body Variation
In this section, surface pressure and skin friction are shown for four sets of solutions: USM3D (tet), FUNFV (prism/hex), SFE, and CFL3D. Hereafter, only these four hemisphere-cylinder solutions of the available set are shown, mainly for conciseness and presentation clarity. First, global

48 Diskin, B. and Thomas, J. L., "Comparison of Node-Centered and Cell-Centered Unstructured Finite-Volume Discretizations: Inviscid Fluxes", AIAA J., Vol. 49, No. 4, 2011, pp. 836-854.
49 Diskin, B. and Thomas, J. L., "Erratum: Comparison of Node-Centered and Cell-Centered Unstructured Finite-Volume Discretizations: Inviscid Fluxes", AIAA J., Vol. 51, No. 1, 2013, pp. 277.


views of solution variation on the L1 grids are shown. Figure 3.22 displays the surface pressure and the x-component of skin friction at the symmetry plane corresponding to y = 0. The pressure maximum indicating the leading-edge stagnation is located near x = 0.03 on the windward side of the hemisphere. The pressure minima are observed on the leeside near x = 0.3 and on the windward side near x = 0.45. A zone of low pressure is also observed on the leeside at x > 4. Near the outflow boundary, the leeside pressure increases and the windward pressure decreases, creating a small negative-lift zone. The L1-grid surface pressure distributions computed with different codes are almost indistinguishable. The fine-grid code-to-code differences in the surface pressure at the local extrema located on the hemisphere are within 0.5%. Although comparisons with experimental data are not the focus of this paper, the computed surface pressure agrees qualitatively with the experimental measurements. Figure 3.22-(b) shows the x-component of the skin friction vector. Note that the y and z components of the skin friction are zero at the symmetry plane. The skin-friction profiles computed with different codes are similar in most places. The largest discrepancy, of about 15%, is observed on the leeside near x = 6.5. Other places of noticeable but more local discrepancies are downstream of the hemisphere-cylinder junction at x = 0.5 and near the outflow boundary. Off-body velocities are nondimensionalized by aref. The solutions are plotted along a vertical line attached to the upper surface of the cylinder at x = 5 and y = 0.21. This view is chosen to show solution variation across the core of the primal crossflow vortex located in this area. All off-body profiles are overplotted in the global view.

Figure 3.22 Global View of Surface Pressure and Skin Friction at the Symmetry Plane (y = 0) for Hemisphere Cylinder – (Courtesy of [Diskin et al.])

3.10.2.6 Effect of Grid Refinement on Surface Pressure and Skin Friction
Grid refinement and zooming have been applied to study solution variation near nontrivial flow features and near regions of the largest solution differences observed in the global views. Although not shown, global views of the surface pressure profiles in the three planes (y = 0, z = 0, and x = 5) computed on the four grids in corresponding families are hardly distinguishable. The results in [Diskin et al.]50 illustrate local grid convergence of the leeside surface pressure near the location of the minimum pressure. Even in the zoomed view, only the coarsest L4 grid solutions are clearly
50 B. Diskin, W. K. Anderson, M. J. Pandya, C. L. Rumsey, J. L. Thomas, Y. Liu, and H. Nishikawa, "Grid Convergence for Three Dimensional Benchmark Turbulent Flows", AIAA Aerospace Sciences Meeting, 2018.


distinguishable. All codes appear to approach the same limit. Variation between L1 solutions is less than 0.07%. The minimum pressure coefficients computed with USM3D, FUNFV, and CFL3D decrease monotonically with grid refinement and show similar coarse-to-fine grid variation of about 7%. The SFE minimum pressure coefficients show remarkably small coarse-to-fine grid variation of less than 0.5%, but converge non-monotonically with grid refinement. For a complete discussion, please consult the work by [Diskin et al.]51.
3.10.2.7 Transonic Flow Around an M6 Wing
The ONERA M6 experiment has been widely used for validation of CFD solvers52. A relatively simple, well-documented geometry and a rich experimental database for a large variety of flow conditions provide a unique combination for practical and inexpensive benchmark studies. Reference solutions for transonic flows around the M6 wing are presented in this section. The authors believe that the solutions computed on grids with more than 360 million degrees of freedom represent the largest M6 computations conducted to date. The grid generation, coarsening, partitioning, and multigrid capabilities for the M6 model are described in an accompanying paper53. The M6 wing geometry used in this study has been slightly redefined for numerical analysis of turbulence model simulations.
3.10.2.8 Geometry, Flow Parameters and Boundary Conditions
Recently, a group at ONERA has considered the M6 model and its past experiments in greater detail54-55. As part of this effort, the group has created a new CAD geometry for the wing. In this geometry, the trailing edge of the wing has been made sharp for the purpose of this particular CFD exercise. The reference solutions for the OM6 wing are computed at a freestream Mach number of 0.84, a Reynolds number of 14.6x10⁶ based on the unit root chord, and an angle of attack of 3.06 degrees. The far-field boundary, in the shape of a hemisphere, is located at 100 unit chords. The symmetry condition is assigned at the plane containing the root airfoil. Note that the experiment used a splitter plate near the wing root, which is not modelled by the CFD codes. This discrepancy is believed to be the cause of disagreement between CFD solutions and experimental measurements at inboard sections.
3.10.2.9 Grids for M6 Wing
The M6 grids used in this study are topologically equivalent to the full-geometry (y = 0 symmetry plane) hemisphere-cylinder grids described before. The cylinder surface is mapped on the wing surface with the specified wing section, and the hemisphere surface is mapped on the rounded wing tip. Five nested grid families have been generated for the M6 geometry by using input profiles available at the TMR website. Statistics of the L4 - L1 grids from families 1 (prism), 2 (tet), 4 (prism/hex) and 5 (structure) are shown in Table 3.5. The far-field boundary grids are not shown because they look similar to the full-geometry extension of those for the hemisphere-cylinder configuration. The surface grids have a moderate stretching toward the leading and trailing edges, resulting in a relatively coarse grid spacing in the mid-chord region. All the L1 grids have the first node off the surface located at an average of approximately y+ = 0.5.

51 See Previous.
52 Schmitt, V. and Charpin, F., "Pressure Distribution on the ONERA-M6-Wing at Transonic Mach Numbers", In Experimental Data Base for Computer Program Assessment, Report of the Fluid Dynamics Panel Working Group 04, AGARD AR 138, 1979.
53 Nishikawa, H. and Diskin, B., "Customized Grid Generation and Processing for Benchmark Three-Dimensional Flows", SciTech-2018, Kissimmee, FL, Jan. 2018, To be published as AIAA Paper.
54 Gleize, V., Dumont, A., Mayeur, J., and Destarac, D., "RANS simulations on TMR test cases and M6 wing with the ONERA elsA flow solver (Invited)", AIAA Paper 2015-1745, 2015.
55 Mayeur, J., Dumont, A., Gleize, V., and Destarac, D., "RANS simulations on TMR 3D test cases with the ONERA elsA flow solver", AIAA Paper 2016-1357, 2016.


Table 3.5 Statistics of Grids for OM6 Wing Grid Families

3.10.2.10 Results for M6 Wing
Figure 3.23 presents the contours of the surface pressure computed by USM3D on the prism/hex L1 grid of family 4. The pressure is non-dimensionalized by ρref a²ref. A lambda shock is clearly visible on the surface, with the shock intersection located at about 80% of the wingspan.
Figure 3.23 M6 Wing: Pressure Contours computed by USM3D on the family 4 prism/hex L1 grid - (Courtesy of [Diskin et al.])
Grid convergence of the aerodynamic coefficients is described next. USM3D solutions have been computed on grids of families 2 (tet) and 4 (prism/hex); FUNFV solutions have been computed on grids of families 1 (prism) and 4 (prism/hex); and CFL3D solutions have been computed on structured grids of family 5 (structure). All computations have been conducted with no flux limiters. Figure 3.24 (a)-(b) shows convergence of the lift and total drag. No solution converges monotonically for all plotted quantities; thus, no order property can be deduced from the observed convergence. Nevertheless, all solutions approach the same aerodynamic coefficient values in the limit of grid refinement. The slopes of the pitching moment convergence curves shown in Figure 3.25 are highly irregular for solutions on grid families 4 (prism/hex) and 5 (structure). For example, the pitching moment coefficient computed from the family 4 USM3D (prism/hex) solutions decreases initially with grid refinement from the L4 grid to the L3 grid, increases on the L2 grid, and decreases again on the L1 grid. Lift and pitching moment convergence observed for FUNFV (prism) and USM3D (tet) solutions is more regular. The differences among lift and pitching-moment coefficients computed by all codes on all grids do not exceed 6%. Drag coefficients appear to be converging with more regular slopes, but do not provide convergence patterns suitable for infinite-grid extrapolation. The total and pressure drag coefficients computed from CFL3D and FUNFV solutions change the direction of convergence on the L1 grids. The viscous drag coefficient computed from the USM3D (tet) solution changes the direction of convergence on the L1 grid. Only USM3D (prism/hex) solutions converge monotonically for the three


drag coefficients. Relative variation of the drag coefficients computed on different grids is more significant than variation of the lift and pitching moment coefficients; pressure and viscous drag coarse-to-fine variation is approximately 30% and 16%, respectively. To establish solution accuracy, Table 3.6 shows the code-to-code variation of the forces, pitching moment, and maximum eddy viscosity on the L1 grid. Among all integral aerodynamic coefficients, the maximum relative difference of 0.94% is observed for the pressure drag. Maximum eddy viscosity variation exceeds 10%, indicating considerably higher uncertainty than in the integrated quantities. For quantities that converge regularly with grid refinement, e.g., lift (Figure 3.24-(a)) and pitching moment (Figure 3.25), the USM3D (tet) solutions appear to provide significant accuracy benefits on same-level grids. Variation of surface pressure coefficients

Table 3.6 Variations of Aerodynamic Coefficients - (Courtesy of [Diskin et al.])
Figure 3.24 M6 Grid Convergence of Aerodynamic Forces CL, CD


computed on the L1 grids at the measurement sections used in the experiment is shown in [Diskin et al.]56. Only three computations, FUNFV (prism/hex), USM3D (prism/hex), and CFL3D (structure), are used in this section for succinctness. The three codes extract surface pressure at the same spanwise locations specified57. In the global view, the L1 pressure profiles from different codes are in close agreement. Small oscillations in the FUNFV solutions are observed near the shocks. All solutions place the shocks at the same locations and identify the same pressure minima on the lower and upper wing surfaces. The pressure profiles at the leading and trailing edges are indistinguishable. As compared to other studies, an improved agreement with the experiment is observed at section 4. This improvement is observed to be due to the increased grid resolution provided by the L1 grids. Figure 3.26 shows a global view of leeside pressure grid refinement at section 1 (η = 0.2). The pressure plots show significant variation with grid refinement. The mid-chord grid spacing on the L4 and L3 grids is too coarse to represent details of the pressure profiles; the corresponding coarse-grid solutions miss most of the shock structure and are significantly different from the solutions obtained on the fine grids. All solutions computed on the L2 and L1 grids represent the shock details and agree with each other remarkably well. The grid convergence patterns of the USM3D and CFL3D solutions are quite similar, as expected, because both codes use cell-centered formulations. FUNFV uses a node-centered formulation and exhibits a different convergence pattern. All codes identify the minimum of pressure at the same location58, η ≈ 0.39. The code-to-code discrepancy in the minimum-pressure value is about 0.09%. The minimum pressure computed from all solutions converges monotonically with grid refinement and demonstrates at least a second-order convergence rate. Most of the L4 and L3 solutions completely miss the double-shock structure in this region. Only the FUNFV L3 solution indicates the presence of a shock structure; the USM3D and CFL3D L3 solutions miss it. However, all the L1 solutions predict a double-shock structure in this region and agree well with each other. All the L1 solutions predict a pressure plateau between the two shocks at 0.3 < η < 0.35. The normalized x-direction grid spacing at this location is Δx/c ≈ 0.02, providing just four grid nodes across the plateau. In spite of the minimal grid resolution, the maximum code-to-code difference between pressure values on this plateau is less than 6%.
Figure 3.25 M6 Grid Convergence of Pitching Moment

3.10.2.11 Concluding Remarks

56 B. Diskin, W. K. Anderson, M. J. Pandya, C. L. Rumsey, J. L. Thomas, Y. Liu, and H. Nishikawa, "Grid Convergence for Three Dimensional Benchmark Turbulent Flows", AIAA Aerospace Sciences Meeting, 2018.
57 The wingspan is taken as b = 1.47601797621980 and the relative axial position was computed as x/c = (x - xmin)/(xmax - xmin).
58 η = x/c.


Detailed grid-convergence studies for two benchmark 3D flows have been conducted by [Diskin et al.]59 to establish reference solutions for the Reynolds-Averaged Navier-Stokes (RANS) equations using the Spalart-Allmaras turbulence model. The benchmark flows are a subsonic flow around a hemisphere cylinder and a transonic flow around the ONERA M6 wing (M6) with a sharp trailing edge. The reference solutions have been computed with three widely used CFD codes developed at NASA Langley: FUN3D, USM3D, and CFL3D. The codes use different discretization approaches and iterative solution methods. Two different unstructured-grid second-order node-centered discretizations available in FUN3D are used for the hemisphere-cylinder computations: the FUNFV discretization uses a standard finite-volume scheme and the SFE discretization uses a recently added stabilized finite-element formulation. SFE is not used for the M6 computations. USM3D uses an unstructured-grid second-order cell-centered finite-volume formulation. CFL3D uses a second-order cell-centered structured-grid formulation. Five families of consistently-refined nested grids of different topology have been generated for the studies, including both structured grids and unstructured grids with various types of elements. The finest family grids provide from 60 M to over 400 M degrees of freedom. To eliminate iterative errors, all solutions on all grids have been converged to near machine-zero residual levels. Although turbulence model validation is not the focus of the paper, the reference solutions have been compared with available experimental data. The main thrust is assessing the variation between CFD solutions computed with different codes on different families of consistently-refined grids. All codes show close agreement in predicting aerodynamic coefficients for the separated flow around the hemisphere-cylinder configuration.
Figure 3.26 M6 Section 1 (η = x/c = 0.2): View of Leeside Pressure Grid Refinement - (Courtesy of [Diskin et al.])
59 B. Diskin, W. K. Anderson, M. J. Pandya, C. L. Rumsey, J. L. Thomas, Y. Liu, and H. Nishikawa, "Grid Convergence for Three Dimensional Benchmark Turbulent Flows", AIAA Aerospace Sciences Meeting, 2018.


The code-to-code discrepancy among all aerodynamic coefficients computed on the finest family grids is less than 4.5%, and the variation among a core group of four solutions is less than 0.75%. The coefficients appear to converge to the same limit with grid refinement, but no convergence order can be discerned from the observed convergence. There is more uncertainty about the grid convergence limit of the maximum eddy viscosity. The surface pressure and skin friction in the different fine-grid solutions overplot in most global views. A local disagreement among the codes of about 15% is observed in the vicinity of the leeside just past the middle section of the cylinder. Various off-body solution components probed outside of this region also overplot. Local solution characteristics, such as surface pressure minima and the circumferential angle of vortex separation and reattachment locations, also converge to the same limit with grid refinement. The reference solutions compare well with available experimental data. The reference solutions for a transonic flow around the M6 have been computed using three formulations: FUNFV, USM3D, and CFL3D. The aerodynamic coefficients computed by different codes on the finest grids of different families agree well; the maximum difference among all coefficients does not exceed 0.73%. The difference in maximum eddy viscosity is 10.3%, which is much larger than the corresponding difference in the aerodynamic coefficients. The surface pressures computed with the three codes have been compared at seven OM6 wing sections. The pressure profiles computed on the finest grids overplot in the global views. Away from shocks, all the pressure profiles computed on the two finest grids are close to each other, within a 1-2% range. The increased grid resolution allows for an improved resolution of the lambda-shock feature that was a challenge in past M6 computations. As compared to previous studies available in the literature, the present solutions on the finest grids provide an improved agreement with the experiment. Further details are available in [Diskin et al.]60.
3.11 Usage Errors
Usage errors are due to the application of the code in a less-than-accurate or improper manner. Usage errors may actually show up as modeling and discretization errors. The user sets the models, grid, algorithm, and inputs used in a simulation, which then establishes the accuracy of the simulation. There may be blatant errors, such as attempting to compute a known turbulent flow with an assumption of inviscid flow. A converged solution may be obtained; however, the conclusions drawn from the simulation may be incorrect. The errors may not be as evident, such as the proper choice of turbulence model parameters for separated flows with shocks. The potential for usage errors increases with an increased level of options available in a CFD code. Usage errors are minimized through proper training and the accumulation of experience. Good sources of information are [Melot et al.]61 and [Roy et al.]62.

3.12 What to trust and what not to?

CFD is generally quite good at predicting surface static pressure distributions. With care, CFD can also be used to predict performance, total-pressure losses and blade turning. Predicting separation, stall and off-design performance can be a challenge, and results with non-attached flows should be interpreted with care. Heat transfer is often very difficult to predict accurately, and it is common to obtain heat-transfer coefficients that are 100% wrong or more. Validation data is critical in order to be able to trust heat transfer simulations. Transition is almost impossible to predict accurately in general. However, there exist models that have been tuned to predict transition and these tend to
60 B. Diskin, W. K. Anderson, M. J. Pandya, C. L. Rumsey, J. L. Thomas, Y. Liu, and H. Nishikawa, "Grid Convergence for Three Dimensional Benchmark Turbulent Flows", AIAA Aerospace Sciences Meeting, 2018.
61 Matthieu Melot, Bernd Nennemann and Claire Deschenes, "Verification of transport equations in a general purpose commercial CFD code", 28th IAHR Symposium on Hydraulic Machinery and Systems (IAHR 2016).
62 C. J. Roy, C. C. Nelson, T. M. Smith, and C. C. Ober, "Verification of Euler/Navier-Stokes Codes using the Method of Manufactured Solutions", International Journal for Numerical Methods in Fluids, 2004.


give acceptable results for cases close to the ones they were tuned for. In general, time permitting, to contain the numerical errors associated with the calculation of cell gradients and face interpolations, we must63:
• Use higher-order discretization schemes (second-order upwind, MUSCL, etc.)
• Attempt to align the grid with the flow to minimize the "false diffusion"
• Refine the mesh – do a mesh-independence study
• Sufficient mesh density is necessary to resolve salient features of the flow
• Interpolation errors decrease with decreasing cell size
• Minimize variations in cell size in non-uniform meshes (mesh quality)
• Truncation error is minimized in a uniform mesh
• Minimize cell skewness and aspect ratio (mesh quality)

3.13 Verification and Validation for Computational Simulation

There are inherent inaccuracies in any numerical simulation of continuum problems due to discretization of the domain. Depending on the size of the discrete elements used, numerical instabilities and uncertainties are introduced. To identify and quantify the main sources of these uncertainties, sets of verification and validation procedures are employed. While the terms are often used interchangeably, verification and validation are distinct. Verification is the assessment of the accuracy of the solution by comparing it to known solutions; validation is the assessment of the accuracy of the simulation against benchmark experimental data. A more compact explanation is provided by (Roache - 1997), with verification defined as "solving the equations right" and validation as "solving the right equations". In short, verification deals with the mathematics, whereas validation deals with the physics. To minimize the uncertainties and errors within a numerical simulation, sets of parametric studies and comparisons are developed. These are simple yet time-consuming and tedious, especially for real applications. To minimize the effort, users are advised to consult the guidelines developed by ITTC, ASME, or the Journal of Fluids Engineering before starting a numerical simulation. The Journal of Fluids Engineering guideline is depicted here as a reference.
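Those guidelines typically require a formal grid-refinement study; a commonly used measure encountered in them is Roache's Grid Convergence Index (GCI). For solutions f1 (fine), f2 (medium) and f3 (coarse) obtained on grids with a constant refinement ratio r, a standard form is

$$p=\frac{\ln\left[(f_{3}-f_{2})/(f_{2}-f_{1})\right]}{\ln r}\ ,\qquad \mathrm{GCI}_{\text{fine}}=F_{s}\,\frac{\left|(f_{2}-f_{1})/f_{1}\right|}{r^{p}-1}$$

where p is the observed order of accuracy and Fs is a safety factor (commonly taken as 1.25 for studies with three or more grids); the GCI is reported as a relative discretization uncertainty band on the fine-grid result.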

63 CFD Online forums, Convergence.



4 CFD in Biomedical Applications
CFD is still emerging in biomedical applications due to the complexity of human anatomy and of human body fluid behavior64. Nevertheless, it is becoming more accessible and practicable by virtue of the advent of digital computers with high-performance hardware and software. Since knowledge of body fluids and of how system components are expected to perform has grown in importance, and bio-fluid physiology studies have been growing over the last several years, the advancement of biomedical practices and technology has been stimulated. Biomedical research with the aid of CFD software is still emerging and incorporates the physiology and pathophysiology of the cardiovascular and respiratory systems through simulation. Various simulation and clinical results have been studied, particularly analyses of blood flow and nasal airflow. In most studies, the blood flow analyses examined the circulation of blood in terms of ventricle function, the coronary arteries and the heart valves. Meanwhile, the nasal airflow analyses studied the basic airflow in the human nose, drug delivery improvement and virtual surgery. Examples of CFD simulations applied to the cardiovascular and respiratory systems are depicted in Figure 4.1 (A-B), respectively. CFD modelling has already received tremendous attention from biomedical researchers along with the development of medical devices. Furthermore, detailed characterization of complex physiology and the measurement of computational metrics can be determined by incorporating both imaging procedures and CFD simulation65. CFD models are continuously being translated into clinical tools for physicians to apply across the spectrum of various diseases of the cardiovascular and respiratory systems. Therefore, this section explores CFD studies using the state of the art in the clinical area, highlighting the biomedical applications. CFD plays an important role by offering chances for simulation prior to undertaking a real commitment to develop medical interventions in the correct direction and to execute any medical design alteration. Research on biomedical CFD applications has received tremendous attention in the past few years due to the importance of computational medical simulations of circulatory functions.

Figure 4.1 Example of CFD simulations in (A) cardiovascular and (B) respiratory systems

64 Ernnie Illyani Basri, Adi Azriff Basri, Vizy Nazira Riazuddin, Siti Farhana Shahwir, Mohammad Zuber, Kamarul Arifin Ahmad, "Computational Fluid Dynamics Study in Biomedical Applications: A Review", International Journal of Fluids and Heat Transfer, Volume 1, Issue 2, June 2016.
65 P.D. Morris, A. Narracott, H. von Tengg-Kobligk, D. Alejandro, S. Soto, S. Hsiao, et al., "Computational fluid dynamics modelling in cardiovascular medicine", (2015) 1–11. doi:10.1136/heartjnl-2015-308044.


The biomedical CFD applications for the cardiovascular and respiratory systems are discussed in the subsequent sub-sections.

4.1 Literature Survey in Biomedical CFD
4.1.1 Cardiovascular Systems
The adoption of CFD is, in theory, highly beneficial within cardiovascular medicine and clinical trials, improving diagnostic assessment and device design, in order to predict physiological responses to intervention and to compute hemodynamic parameters that cannot otherwise be measured. Research on CFD applications in the cardiovascular system addresses the associated methodology, analytical assessment and results for three main physiologies of heart function, namely the valves, the arteries and the ventricle. Cardiovascular disease, in particular heart disease, is the major cause of death around the world. Heart valve disease is a common condition caused by narrowing of the aortic valve or leaking of blood flow at the valve leaflets. In a recent study, [Basri et al.]66 examined the hemodynamic effects of different valve openings of 45˚, 62.5˚ and fully open, using a combination of magnetic resonance imaging (MRI) and CFD simulation. The authors investigated the hemodynamic properties in terms of pressure, velocity and wall shear stress to determine the blood behavior of severe aortic stenosis. The results show a significant decrease of blood pressure for the small valve opening, which caused an obstruction of blood ejection due to the narrowing of the valve. Hence, the study found that the lower leaflet opening has a detrimental effect on blood flow and induces higher stress on the leaflets. Besides that, [Basri et al.] compared a normal aortic valve (fully open) and a stenosed aortic valve (62.5˚ opening) through a study of hemodynamic properties. The authors used CFD simulation on a 3D aortic valve imported from MRI scan data. The study observed a 13.7% increase in velocity and a 2.9% reduction in the mass of blood entering at the aortic branches of the stenosed aortic valve compared to the normal aortic valve. Thus, the study indicated a significant reduction of the blood supply to the head, neck and arms of the human body. Meanwhile, [Tan et al.]67 performed a patient-specific assessment of a stenosed aortic valve and compared the aortic flow patterns before and after deploying a transcatheter aortic valve. The authors carried out CFD simulations incorporating MRI scan data to investigate the flow patterns in thoracic aortas in terms of velocity profile and wall shear stress. The flow pattern results show a 20% reduction of jet flow in the instantaneous velocity streamlines and a lower time-averaged wall shear stress after implantation. Hence, the combination of imaging and simulation approaches in this study led to an individual evaluation of the disturbed blood flow patterns and wall shear stress on the aorta before and after undergoing the implantation procedure. [Jamuna & Abnurajan]68 measured the velocity and pressure of the blood flow through a patient-specific aorta in different conditions: a normal aorta, an aorta with plaque at the valve sinus side, and an aorta requiring a bi-leaflet valve implant. The authors incorporated a computed tomography (CT) image of a specific patient and analyzed it using CFD simulation. It is observed that the blood pattern after implanting a valve is similar to the normal aorta, where the percentage increases in velocity and blood pressure are shown

66 A.A. Basri, M. Zubair, A.F.A. Aziz, R.M. Ali, M. Tamagawa, K.A. Ahmad, "Computational Fluid Dynamics Study of the Aortic Valve Opening on Hemodynamics Characteristics", in: 2014 IEEE Conf. Biomed. Eng. Sci., 8-10 December 2014, Miri, Sarawak, IEEE, 2014: pp. 99–102. doi:10.1109/IECBES.2014.7047660.
67 F.P.P. Tan, X.Y. Xu, R. Torii, N.B. Wood, N. Delahunty, M. Mullen, et al., "Comparison of Aortic Flow Patterns Before and After Transcatheter Aortic Valve Implantation", Cardiovasc. Eng. Technol. 3 (2012) 123–135. doi:10.1007/s13239-011-0073-3.
68 J. C., M. Abnurajan, "Design of Patient Specific Prosthetic Aortic Valve and to Study its Computational Fluid Dynamics", 3rd Int. Conf. Electron. Comput. Technol. 3 (2011) 355–360.


to be 58.5% and 81.8%, respectively. [Sirois et al.]69 also studied the implantation of an aortic valve in a specific patient by using CT images and CFD simulations. The authors performed a quantitative analysis of the hemodynamics in terms of blood flow patterns before and after the implantation procedure. A reduction of the pressure drop by 25.27 mmHg and an increase of the effective orifice area from 0.53 to 1.595 cm² showed a significant result following the valve implantation. The assessment of the hemodynamic properties is carried out considering the parameters of wall shear stress, oscillatory shear index and average wall shear stress gradient for 30 patients. [Gao et al.]70 also studied stent implantation as an interventional procedure for the treatment of coronary artery disease. The authors compared the blood flow before and after stent implantation and analyzed the parameters in terms of wall shear stress and blood velocity. From the study, the wall shear stress and blood velocity are greater in the region of stenosis prior to implanting the stent, and the results show a reduction of the maximum flow rate in the coronary artery and an increased value of wall shear stress after the implantation procedure. [Chaichana et al.]71 studied the hemodynamic effects of simulated plaque in left coronary artery models with patient-specific coronary stenosis. Three parameters are measured, namely wall shear stress, pressure gradient and flow velocities, by using CFD analysis, and compared between the presence and absence of plaques in the left coronary models during the cardiac cycle. It is observed that the highest pressure gradients in the stenotic regions are caused by the plaques and that lower flow velocity areas are found at post-plaque locations, while the wall shear stress is similar in the stenotic regions.
4.1.2 Respiratory Systems
Besides the importance of the cardiovascular system for blood circulation and nutrient transportation throughout the human body, the respiratory system also plays an essential role in human lung function, primarily for nasal breathing. A CFD-based analysis provides a better understanding of the airflow characteristics, incorporated with fluid dynamics, in the nasal cavity to obtain functional and anatomical data. Research on CFD applications in the respiratory system has received attention concerning basic airflow studies of the physiology of the nose, drug deposition and virtual surgery for surgical intervention. Recent studies have been conducted by combining computational analysis with imaging to obtain significantly realistic numerical simulations of the respiratory system. [Segal et al.]72 studied the differences in respiratory flow patterns of four different human nasal cavities by using MRI scans and CFD simulations. The study is conducted by performing numerical simulations of steady-state inspiratory laminar airflow for a flow rate of 15 L/min and comparing the measurements in terms of streamline patterns, velocities and helicity values. The authors observed that the majority of the flow passed through the middle and ventral regions of the nasal passages; however, the amount and location of swirling flow varied among subjects. [Wen et al.]73 also simulated steady laminar nasal airflow for flow rates of 7.5 to 15 L/min to present the flow patterns between the left and right nasal cavities by adopting CFD simulation software (FLUENT®) and CT scan images of human nasal cavity models.
The authors measured flow pattern features that included high velocities in the constrictive nasal valve region, vortex formation posterior to the nasal valve regions and high flow close to the septum walls. The results show that the nasal resistance within the first 2-3

69 E. Sirois, Q. Wang, W. Sun, "Fluid Simulation of a Transcatheter Aortic Valve Deployment into a Patient-Specific Aortic Root", Cardiovasc. Eng. Technol. 2 (2011) 186–195. doi:10.1007/s13239-011-0037-7.
70 F. Gao, G. Li, R. Hu, H. Okada, "Computational Fluid Dynamic Analysis of Coronary Artery Stenting", Int. J. Biosci. Biochem. Bioinforma. 4 (2014) 155–159. doi:10.7763/IJBBB.2014.V4.330.
71 T. Chaichana, Z. Sun, J. Jewkes, "Computational fluid dynamics analysis of the effect of plaques in the left coronary artery", Comput. Math. Methods Med. 2012 (2012) 504367. doi:10.1155/2012/504367.
72 R.A. Segal, G.M. Kepler, J.S. Kimbell, "Effects of differences in nasal anatomy on airflow distribution: A comparison of four individuals at rest", Ann. Biomed. Eng. 36 (2008) 1870–1882.
73 J. Wen, K. Inthavong, J. Tu, S. Wang, "Numerical simulations for detailed airflow dynamics in a human nasal cavity", Respir. Physiol. Neurobiol. 161 (2008) 125–135. doi:10.1016/j.resp.2008.01.012.

73

cm contributes up to 50% of the total airway resistance, and vortices were found in the upper olfactory region and posterior to the nasal valve region. [Croce et al.]74 also simulated steady-state laminar airflow for a flow rate of 353 ml/s in the left and right nostrils using CFD simulation software (FLUENT®), based on CT scan images of a plastinated head processed with the commercial software package AMIRA (Mercury Computer Systems, Berlin). The authors described the flow patterns in a physiologically realistic bi-nasal model considering the pressure drop. The results show that the major total pressure drop occurs in the nasal valve region and that the predominant airflow is in the inferior median part of the nasal cavities. Vortices are also observed downstream of the nasal valve and towards the olfactory region. Other than basic airflow studies of the physiological function of the nose, drug deposition is of fundamental importance in the treatment of different lung diseases and allergies. Recent CFD studies related to drug deposition have received great interest in order to characterize local deposition patterns and optimize drug delivery in the respiratory system. [Bahmanzadeh et al.]75 studied the effect of endoscopic sphenoidotomy surgery on the flow patterns and deposition of micro-particles in the human nasal passage and sphenoid sinus. The authors presented transient airflow patterns pre- and post-surgery during a full breathing cycle under cyclic flow conditions. The transport and deposition of inhaled micro-particles are evaluated by using a Lagrangian approach to track the unsteady particles entering the nasal airway during the inhalation phase of the breathing cycle. The study found increased airflow due to the sphenoidotomy and increased deposition of micro-particles in the sphenoid region. In the post-operative case, 25 μm particles are observed to be able to penetrate into the sphenoid region, and the highest deposition for 10 μm particles, at about 1.5%, occurred during resting breathing. [Dastan et al.]76 studied the deposition of fibrous particles in different human nasal passages by using CFD simulations. The authors developed an in-house code to solve the combined translational and rotational equations of motion of ellipsoids for fiber transport and deposition in the nasal airways. The results show a significant effect on the deposition fraction due to the variation of nasal airways. The deposition fraction is highly affected by the nasal geometry and by the airflow rate in the nasal valve and main airway regions. Hence, it is shown that the aerodynamic diameter, based on the Stokes equivalent diameter employed in the impaction parameter, could collapse the simulation data of spherical and fibrous particles onto a single curve. [Abouali et al.]77 studied the airflow distribution and particle deposition in the nasal airway, maxillary and frontal sinuses for a developed virtual uncinectomy and middle meatal antrostomy. The study considered the inhalation of micro- and nano-particles to determine the penetration of airflow into the sinus cavity. For the micro-particles, the path and deposition of particles in the nasal passages and maxillary sinuses are evaluated by using a Lagrangian trajectory analysis approach. Meanwhile, the transport and deposition of the nano-particles are analyzed by using a diffusion model. The rate of particle deposition in the maxillary and frontal sinuses is analyzed and compared between pre- and post-surgery conditions.
The results show that almost no particles entered the sinuses in the pre-operative condition. However, the inhaled nano- and micro-particles easily entered the sinuses due to the increase of
74 C. Croce, R. Fodil, M. Durand, G. Sbirlea-Apiou, G. Caillibotte, J.-F. Papon, et al., "In Vitro Experiments and Numerical Simulations of Airflow in Realistic Nasal Airway Geometry", Ann. Biomed. Eng. 34 (2006) 997–1007. doi:10.1007/s10439-006-9094-8.
75 H. Bahmanzadeh, O. Abouali, M. Faramarzi, G. Ahmadi, "Numerical simulation of airflow and micro-particle deposition in human nasal airway pre- and post-virtual sphenoidotomy surgery", Comput. Biol. Med. 61 (2015) 8–18. doi:10.1016/j.compbiomed.2015.03.015.
76 A. Dastan, O. Abouali, G. Ahmadi, "CFD simulation of total and regional fiber deposition in human nasal cavities", J. Aerosol Sci. 69 (2014) 132–149. doi:10.1016/j.jaerosci.2013.12.008.
77 O. Abouali, E. Keshavarzian, P. Farhadi Ghalati, A. Faramarzi, G. Ahmadi, M.H. Bagheri, "Micro and nanoparticle deposition in human nasal passage pre and post virtual maxillary sinus endoscopic surgery", Respir. Physiol. Neurobiol. 181 (2012) 335–345. doi:10.1016/j.resp.2012.03.002.


airflow penetration into the sinus cavity after surgery. Apart from this, virtual surgery in relation to CFD simulation has also received great interest as a way to determine the best possible surgical treatment of a constricted airway78. In most studies, the virtual surgery consists of removing one or both of the obstructions in different proportions in order to enhance the nasal airway compared to its baseline condition. A recent study by [Moghadas et al.]79 examined the effect of septal deviation on the flow patterns and deposition of micro/nanoparticles in realistic human nasal airways before and after septoplasty. The authors simulated the steady airflow through the nasal passage by using Eulerian and Lagrangian approaches for nano- and micro-particles. From the simulations, the results show that the flow field and particle deposition depend on the passage geometry. For micro-particles, the deposition rate with septal deviation is higher compared to the normal and post-operative passages. Meanwhile, the deposition of nano-particles shows similar trends for both the normal and post-operative passages. Hence, the aid of simulation provides a suitable tool for predicting the airflow and particle deposition patterns in the nasal passages that specific surgical interventions would produce. [Xiong et al.]80 compared nasal airflow after two different surgical interventions involving three facets, namely opening the paranasal sinuses, excising the ethmoid sinuses, and excising or preserving the uncinate process, in a cadaveric head model through CFD simulations. The study found that significantly large changes in nasal cavity airflow velocity are apparent when the uncinate process is excised, and similar nasal cavity airflow when the uncinate process is preserved. The uncinate-process-excising procedure shows a greater increase in airflow volume compared with the uncinate-process-preserving procedure. Previously, [Xiong et al.]81 carried out a numerical simulation of nasal cavity airflow pre and post virtual functional endoscopic sinus surgery (FESS) with the aid of CFD simulations (FLUENT). The authors aimed to investigate and numerically visualize the airflow trace, distribution, velocity, air pressure and airflow exchange between the nasal cavity and paranasal sinuses in a normal adult subject. The results show an increased airflow distribution in the maxillary, ethmoid and sphenoid sinuses, and an increase of 13% through the area connecting the middle meatus and the surgically opened ethmoid. On the other hand, [Garcia et al.]82 used CFD simulations with medical imaging software (MIMICS, Materialise) to study the airflow, water transport, and heat transfer in the nose of an Atrophic Rhinitis (AR) patient. The subject is a patient who received treatment via a nasal cavity-narrowing procedure, in which rib cartilage was implanted under the mucosa along the floor of the nose and a septal spur was removed. The reconstructed nose is simulated and the nasal airflow is assumed to be laminar at 15 L/min, corresponding to a resting breathing rate. The simulations show that the atrophic nose does not condition the inspired air as effectively as the healthy geometries.

4.2 Merits and Limitations of Biomedical Applications in CFD
The CFD model is applied as described before, but the preprocessing phase now also draws on clinical imaging (MRI, CT scans, etc.)83, and validation can, of course, be performed with the same tools in post-processing. The whole procedure is shown in Figure 4.2.

78 G. Mylavarapu, "Computational Flow Modeling of Human Upper Airway Breathing", University of Cincinnati, 2013. http://gradworks.umi.com/36/01/3601415.html.
79 H. Moghadas, O. Abouali, A. Faramarzi, G. Ahmadi, "Numerical investigation of septal deviation effect on deposition of nano/microparticles in human nasal passage", Respir. Physiol. Neurobiol. 177 (2011) 9–18.
80 G.-X. Xiong, J.-M. Zhan, K.-J. Zuo, L.-W. Rong, J.-F. Li, G. Xu, "Use of computational fluid dynamics to study the influence of the uncinated process on nasal airflow", J. Laryngol. Otol. 125 (2011).
81 G. Xiong, J.-M. Zhan, H.-Y. Jiang, J.-F. Li, L.-W. Rong, G. Xu, "Computational fluid dynamics simulation of airflow in the normal nasal cavity and paranasal sinuses", Am. J. Rhinol. 22 (2008) 477–482.
82 G.J.M. Garcia, N. Bailie, A. Martins, J.S. Kimbell, et al., "Atrophic rhinitis: a CFD study of air conditioning in the nasal cavity", J. Appl. Physiol. (2007) 1082–1092. doi:10.1152/japplphysiol.01118.2006.
83 P.D. Morris, A. Narracott, H. von Tengg-Kobligk, D. Alejandro Silva Soto, S. Hsiao, et al., "Computational fluid dynamics modelling in cardiovascular medicine", Heart (2015) 1–11. doi:10.1136/heartjnl-2015-308044.


CFD has received increasing interest, evolving from a mathematical curiosity into an important technique for studying complex physiological flow patterns, and it has demonstrated its potential especially in the cardiovascular and respiratory systems. To date, CFD has been adopted by medical researchers to help predict the characteristics of circulatory blood flow inside the human body and of airflow in human nasal breathing. It offers benefits such as lowering the chance of post-operative complications, supporting the development of better surgical treatments, achieving high efficiency with less invasive medical equipment, and conveying a good understanding of biological processes84. From a theoretical point of view, CFD provides benefits by concentrating on the construction and solution of the governing equations and on the study of the numerous approximations to these equations. From the experimental and numerical point of view, the merit of CFD is that it is a cost-effective alternative means of simulating real fluid flow, particularly flows involving human body systems, and it provides detailed visual and comprehensive information compared with purely analytical or experimental approaches. Despite these merits, CFD also has limitations. The accuracy of CFD solutions is limited by the fidelity of the physical models (e.g., turbulence, multiphase flow, compressibility) and by the quality of the input data describing the real-world process. Numerical results must therefore be thoroughly analyzed and examined before critical judgements are made about them. Furthermore, numerical errors such as round-off and truncation errors invariably occur when solving the equations on a computer. The practicability of CFD thus depends on several factors: the specific materials and process, accurate algorithms for the governing equations, powerful CFD packages, and high-speed, large-memory computers.

Figure 4.2 CFD Model Construction for Biomedical Application

84 B. K. Lee, "Computational fluid dynamics in cardiovascular disease", Korean Circ. J. 41 (2011) 423–430. doi:10.4070/kcj.2011.41.8.423.


4.3 Hemodynamic Flow Modeling

The equations describing incompressible flows may be written as

$$\nabla\cdot\mathbf{v} = 0\ ,\qquad \rho\left(\frac{\partial \mathbf{v}}{\partial t} + \mathbf{v}\cdot\nabla\mathbf{v}\right) = \nabla\cdot\boldsymbol{\sigma} + \mathbf{f}\ ,\qquad \boldsymbol{\sigma} = -p\,\mathbf{I} + \boldsymbol{\tau} \qquad \textbf{Eq. 4.1}$$

where ρ is the density, v is the velocity vector, σ is the stress tensor, and f is the external or body force, which is assumed to be zero here. The stress tensor is decomposed into a hydrostatic part and a deviatoric stress τ, which is a function of the shear-rate tensor D:

$$\boldsymbol{\tau} = \mu(\dot{\gamma})\,\mathbf{D}\ ,\qquad \mathbf{D} = \frac{1}{2}\left(\nabla\mathbf{v} + \nabla\mathbf{v}^{T}\right)\ ,\qquad \dot{\gamma} = \sqrt{\frac{1}{2}\sum_{i}\sum_{j} D_{ij}\,D_{ji}} \qquad \textbf{Eq. 4.2}$$

where μ is the dynamic blood viscosity and γ̇ is the shear rate. Blood is a non-Newtonian fluid, implying that the viscosity μ depends on the strain-rate tensor. The last two decades have seen impressive progress in our ability to solve these equations in an expedient manner. Key elements of any modern incompressible flow solver include an Arbitrary Lagrangian-Eulerian (ALE) formulation for moving walls (deforming grids) with implicit time stepping. To close the system of equations, constitutive relations are needed to calculate the viscosity of the blood, and a variety of such relations have been proposed to model blood flow. The simplest model is a Newtonian fluid, which assumes a constant viscosity (μ = μ0). Recent studies, however, suggest that shear-dependent viscosity models can accurately capture the shear-thinning nature of blood flow. The most common non-Newtonian model used for blood is the power law, which can be expressed in the following form:

$$\mu = k\,(\dot{\gamma})^{\,n-1} \qquad \textbf{Eq. 4.3}$$

where k is the flow consistency index and n is the power-law index, which characterizes the non-Newtonian behavior of the blood85. This is one of the simplest models used for representing the behavior of a non-Newtonian fluid. However, since the model viscosity grows without bound as the shear rate approaches zero, only values in a realistic shear-rate range can approximate non-Newtonian fluid behavior. The power-law index is usually chosen so that the model reproduces the shear-thinning behavior of blood in hemodynamic simulations. Both k and n depend on the composition of the blood, mainly the hematocrit, and are therefore subject to change for each individual86. Reference87 gives the dynamic viscosity used with this type of viscosity model.

85 Hussain, M.A., Kar, S., Puniyani, R.R., "Relationship between power law coefficients and major blood constituents affecting the whole blood viscosity", J. Biosci. 24(3), 329–337, 1999.
86 Cho, Y.I., Kensey, K.R., "Effects of the non-Newtonian viscosity of blood on flows in a diseased arterial vessel. Part 1: steady flows", Biorheology 28, 241–262, 1991.
87 Hamidreza Gharahi, Byron A. Zambrano, David C. Zhu, J. Kevin DeMarco, Seungik Baek, "Computational fluid dynamic simulation of human carotid artery bifurcation based on anatomy and volumetric blood flow rate measured with magnetic resonance imaging", International J. Advanced Engineering Science Applied Math. doi:10.1007/s12572-016-0161.
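To make Eq. 4.2 and Eq. 4.3 concrete, the short sketch below evaluates the shear rate from a velocity-gradient tensor and then the power-law viscosity. It is only a minimal illustration: the values of k and n are assumed, representative blood-like numbers rather than patient-specific data, and the low-shear cutoff is an arbitrary device to avoid the unbounded viscosity discussed above.

```python
import numpy as np

def shear_rate(grad_v):
    """Scalar shear rate from the velocity gradient (Eq. 4.2).

    grad_v[i, j] = d v_i / d x_j; D = 0.5*(grad_v + grad_v.T) and
    gamma_dot = sqrt(0.5 * sum_ij D_ij * D_ji), following the 1/2
    convention written in Eq. 4.2.
    """
    grad_v = np.asarray(grad_v, dtype=float)
    D = 0.5 * (grad_v + grad_v.T)
    return np.sqrt(0.5 * np.einsum("ij,ji->", D, D))

def power_law_viscosity(gamma_dot, k=0.017, n=0.708, gamma_min=1e-3):
    """Power-law viscosity mu = k * gamma_dot**(n-1) (Eq. 4.3).

    k and n are assumed illustrative values for blood; gamma_min caps the
    growth of mu as gamma_dot -> 0.
    """
    g = max(float(gamma_dot), gamma_min)
    return k * g ** (n - 1.0)

if __name__ == "__main__":
    # Simple shear flow v = (G*y, 0, 0) with G = 100 1/s
    grad_v = [[0.0, 100.0, 0.0],
              [0.0,   0.0, 0.0],
              [0.0,   0.0, 0.0]]
    g = shear_rate(grad_v)
    print(f"gamma_dot = {g:.1f} 1/s -> mu = {power_law_viscosity(g):.5f} Pa.s")
```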

4.4 Boundary Conditions

The imposition of proper flow boundary conditions represents one of the most difficult, and admittedly questionable, aspects of patient-specific simulations. In the first place, the flux data are not easy to obtain: measuring velocity profiles via phase-contrast MRA (PC-MRA) requires non-standard imaging protocols and a longer scanning time. Then there is the question of resolution. The number of pixels required for accurate vessel geometry reconstruction is much lower than the number required for accurate flow profile reconstruction, and only the velocity normal to the MRA cut is measured, so all cross-flow information is lost; a complete characterization of the velocity field would require even longer scanning times. For some vessels, peak velocities can be measured using ultrasound techniques, and these can in turn be used to impose boundary conditions. On the other hand, we know that the flow in curved tubular structures can exhibit considerable cross flow, and that any form of cross flow can have significant effects downstream. To date, most CFD simulations have been carried out by prescribing fully developed, time-dependent velocity profiles derived from flow-rate curves using the Womersley solution. The Womersley solution holds only for pulsating flow in an infinitely long circular cylinder; for other vessel cross-sections the Womersley profiles are mapped accordingly. Pressure boundary conditions are important for fluid-structure interaction (FSI) simulations with compliant walls. Pressures can be obtained invasively using catheters, but it would be highly desirable to develop non-invasive pressure measuring techniques. Major outstanding problems in this field are:

• The derivation of post-operative boundary conditions from pre-operative data, and
• The derivation of boundary conditions when complete information is unavailable.

We do not expect to be able to obtain complete flow and pressure data for this complex arterial system for years to come.
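Since the full Womersley solution involves Bessel functions of complex argument, a common simplification is to redistribute a measured flow-rate waveform over the inlet as a quasi-steady parabolic profile. The sketch below shows that simplified approach only; the waveform and radius are illustrative assumptions, and the phase lag and flattening of the true pulsatile (Womersley) profile are deliberately ignored.

```python
import numpy as np

def parabolic_inlet_velocity(r, t, flow_rate, radius):
    """Quasi-steady parabolic inlet profile built from a flow-rate waveform.

    At each instant the flow rate Q(t) is distributed as a Poiseuille profile
        u(r, t) = 2 Q(t) / (pi R^2) * (1 - (r/R)^2),
    which integrates back to Q(t) exactly but is only a crude stand-in for
    the Womersley solution mentioned in the text.
    """
    u_mean = flow_rate(t) / (np.pi * radius ** 2)
    return 2.0 * u_mean * (1.0 - (np.asarray(r) / radius) ** 2)

if __name__ == "__main__":
    # Assumed carotid-like waveform: 6 ml/s mean with a pulsatile component, period 1 s
    Q = lambda t: 6.0e-6 * (1.0 + 0.5 * np.sin(2.0 * np.pi * t))
    r = np.linspace(0.0, 3.0e-3, 4)               # inlet radius R = 3 mm
    print(parabolic_inlet_velocity(r, t=0.2, flow_rate=Q, radius=3.0e-3))
```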

4.5 Structural Deformation Models
Arterial wall movement has a profound effect on local flow conditions: one observes that fluxes do not 'add up' if the deformation of the wall is neglected. In principle, the vessel wall and the surrounding medium can be modeled using a structural dynamics solver for 3D nonlinear, large-deformation behavior. However, the difficulties in obtaining proper initial and boundary conditions are even more pronounced here than for the flow solver. The material is highly nonlinear, orthotropic, layered, and so on, and how to obtain this information non-invasively is, at this point, an open question. For this reason, most wall deformations have been computed using shells [Per, Zha] or, even simpler, an independent ring model [Qua]. In this case, the normal wall displacement η is obtained from:

$$m\,\eta_{tt} + d\,\eta_{t} + k\,\eta = p\ ,\qquad m = \rho_w\,h\ ,\qquad k = \frac{E\,h}{(1-\nu^{2})\,r^{2}} \qquad \textbf{Eq. 4.4}$$

where ρw, h, r, E, and ν denote, respectively, the wall density, thickness, vessel radius, Young's modulus, and Poisson ratio, while d is a damping coefficient and p is the fluid pressure load. This equation is integrated using a second-order implicit time integration scheme.
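The document does not specify which second-order implicit scheme is used, so the sketch below integrates Eq. 4.4 with one possible choice, the trapezoidal rule, for a single wall node; all material values and the pressure waveform are illustrative assumptions.

```python
import numpy as np

def integrate_ring_model(p_of_t, dt, n_steps,
                         rho_w=1100.0, h=5e-4, r=3e-3, E=4e5, nu=0.49, d=50.0):
    """Trapezoidal (second-order implicit) integration of the ring model (Eq. 4.4):
        m eta'' + d eta' + k eta = p(t),  m = rho_w*h,  k = E*h/((1-nu^2)*r^2).
    Returns the history of the radial wall displacement eta.
    """
    m = rho_w * h
    k = E * h / ((1.0 - nu ** 2) * r ** 2)
    A = np.array([[0.0, 1.0],
                  [-k / m, -d / m]])        # y' = A y + b(t), y = [eta, eta_dot]
    I = np.eye(2)
    lhs = I - 0.5 * dt * A                  # (I - dt/2 A) y_{n+1} = rhs
    y = np.zeros(2)
    eta = [y[0]]
    for n in range(n_steps):
        b_n  = np.array([0.0, p_of_t(n * dt) / m])
        b_n1 = np.array([0.0, p_of_t((n + 1) * dt) / m])
        rhs = (I + 0.5 * dt * A) @ y + 0.5 * dt * (b_n + b_n1)
        y = np.linalg.solve(lhs, rhs)
        eta.append(y[0])
    return np.array(eta)

if __name__ == "__main__":
    p = lambda t: 2000.0 + 500.0 * np.sin(2.0 * np.pi * t)   # assumed pulsatile load (Pa)
    eta = integrate_ring_model(p, dt=1e-3, n_steps=2000)
    print(f"final wall displacement: {eta[-1]*1e6:.1f} micrometers")
```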

4.6 Fluid-Structure Interaction Techniques
Given that vessel deformation plays an important role in local flux evaluations, the fluid and structure models must be combined88. Owing to their generality, modularity, and extendibility, so-called loose coupling techniques have been used extensively in engineering.

88 Rainald Lohner, Juan Cebral, Orlando Soto, Peter Yim, James E. Burgess, "CFD in Medicine and Life Sciences Applications on the Living Human Being", George Mason University, Fairfax, VA 22030-4444, USA.

The key idea is to have a master code that invokes the fluid and structural codes alternately, in such a way that a minimum of changes is required in the latter. For implicit CFD and CSD codes, we use the following under-relaxed predictor-corrector scheme within each time step:

    while not converged:
        update structure with fluid load:              $x_S^i = (1-\alpha)\,x_S^{i-1} + \alpha\,f(\sigma_F^i)$
        update fluid with structure position/velocity: $\sigma_F^i = (1-\alpha)\,\sigma_F^{i-1} + \alpha\,g(x_S^i)$
    end while                                                                              Eq. 4.5

Typical under-relaxation factors are in the range 0.5 ≤ α ≤ 0.9. The subscripts F and S denote the fluid and solid (structure) fields, respectively: $f(\sigma_F^i)$ is the structural surface deformation produced by the fluid forces, and $g(x_S^i)$ is the change of the fluid forces resulting from the structural deformation.
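A minimal sketch of the under-relaxed loose-coupling loop of Eq. 4.5 is given below. The fluid and structure solvers are represented by placeholder callables (toy linear relations), so this only illustrates the iteration pattern, not an actual CFD/CSD coupling.

```python
import numpy as np

def coupled_step(fluid_load, structure_solve, x_s, sigma_f,
                 alpha=0.7, tol=1e-8, max_iter=50):
    """Under-relaxed predictor-corrector iteration for one time step (Eq. 4.5).

    fluid_load(x)        -> fluid stress on the wetted surface for a given wall shape
    structure_solve(sig) -> wall displacement produced by a given fluid stress
    Both are placeholders standing in for the real solvers.
    """
    for _ in range(max_iter):
        x_new = (1.0 - alpha) * x_s + alpha * structure_solve(sigma_f)
        sig_new = (1.0 - alpha) * sigma_f + alpha * fluid_load(x_new)
        converged = (np.linalg.norm(x_new - x_s) < tol and
                     np.linalg.norm(sig_new - sigma_f) < tol)
        x_s, sigma_f = x_new, sig_new
        if converged:
            break
    return x_s, sigma_f

if __name__ == "__main__":
    structure_solve = lambda sig: 1e-5 * sig        # toy: displacement = load / stiffness
    fluid_load = lambda x: 2000.0 - 1e3 * x         # toy: load relaxes as the wall yields
    x, sig = coupled_step(fluid_load, structure_solve, x_s=0.0, sigma_f=2000.0)
    print(f"converged wall displacement: {x:.6e}, fluid load: {sig:.2f}")
```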

4.7 Future of CFD in Biomedical Engineering
The rapid development of CFD in the face of major computational modelling and technological challenges has been recognized by regulatory authorities. Creative models and novel applications for simulating complex fluid mechanics problems in the cardiovascular and respiratory anatomy are now being applied progressively with modern CFD simulation programs. It is therefore important to demonstrate the effectiveness of simulation results relative to invasive measurements through observational trials, particularly in multicenter clinical studies. Clearly, these methods have a high potential to change clinical practice in ways that benefit patients, health providers, and clinicians. The ability to predict flows in the vascular and pulmonary systems accurately on a patient-specific basis has increased dramatically in recent years, and we expect progress to continue in all the areas that make up a comprehensive simulation capability: image segmentation, grid generation, flow solvers, fluid-structure interaction (FSI), data reduction, and visualization. Some of the outstanding questions involve boundary conditions, material parameters (in particular wall compliance), and the clinical significance of particular flow phenomena. At present, image-based, patient-specific computational hemodynamics can be used to:

• Study vascular diseases;
• Enhance diagnosis; and
• Plan surgical procedures.

Imaging modalities will continue to evolve, and eventually both anatomy and physiology will be accurately visualized. However, the power of computer simulations lies in their ability to predict the outcome of procedures, i.e. to answer the 'what if' questions that are useful for optimizing therapies89. Looking into the more distant future, we predict:

• CFD-enhanced radiology,
• Simulations of long-term effects, such as plaque formation,
• Simulations of drug delivery and its effects, and
• The coupling of flow codes (continuum level) and particle codes (molecular level).

89 Rainald Lohner, Juan Cebral, Orlando Soto, Peter Yim, James E. Burgess, "CFD in Medicine and Life Sciences Applications on the Living Human Being", George Mason University, Fairfax, VA 22030-4444, USA.

4.8 Case Study 1 - Modeling Fluid-Structure Interaction in a Heart Valve
Background
The average human heart beats around 100,000 times in a single day, with almost perfect pumping. With each beat, the four valves within the heart open and close, transporting blood uni-directionally through its chambers. By simulating heart valves, medical researchers can study their behavior to address various cardiac health issues. As an example, a team from [Veryst] Engineering modeled a heart valve opening and closing with the aid of the COMSOL Multiphysics® software90.
Advancing Heart Valve Research via Simulation
The four valves in a human heart are flexible enough to both fully open, enabling blood to flow in one direction through the heart, and tightly close, sealing the heart chambers and preventing backflow. With valvular heart disease, however, the valves do not function properly, which can cause serious cardiac health issues. As a result, studying heart valves is an important research area. One recent advancement has been the development of the smallest approved mechanical heart valve in the world. This is an important achievement, as every year over 35,000 babies in the United States alone are born with congenital heart defects, and for some of these newborns the defects result in malfunctioning heart valves that require surgery to fix. Of course, the creation of the smallest approved valve is only one example of innovation in heart valve research. This area has also sparked the interest of a team at Veryst Engineering, a COMSOL Certified Consultant that has worked with clients on similar real-world problems. To further advance heart valve research, the team was inspired to create an example model of a heart valve. Such a model could serve as a valuable design tool, providing crucial information to medical researchers (see Figure 4.3).

Figure 4.3 Schematic of a Heart. Image by Wapcaplet (Licensed via Wikimedia Commons)

Modeling the Opening and Closing of a Heart Valve in COMSOL Multiphysics
As you might expect, modeling a human heart valve can be difficult and computationally expensive.

90 Caty Fairclough, "Modeling Fluid-Structure Interaction in a Heart Valve", COMSOL Blog, 2018.


For one, the problem involves strongly coupled fluid-structure interaction (FSI), with a moving and deforming structure interacting with a flowing fluid. In addition, it is important to account accurately for nonlinear material behavior, contact modeling, and fluid-mesh movement. To address this challenge, [Nagi Elabbasi] (a member of the Veryst team) used COMSOL Multiphysics, saying that the software provides a "unique capability to capture all [of] the coupled effects involved." Using COMSOL Multiphysics, [Elabbasi] created a simple example to highlight how engineers can overcome the challenges of modeling realistic heart valves and predict their behavior. In this model, a heart valve opens and closes in response to the fluid flow. Modeling this movement was not easy, with [Elabbasi] noting that "the main challenges in this model are the closing of the heart valve and accurately representing the material behavior of the valve." This poses an issue because the fluid mesh can collapse when the heart valve is closed; to avoid excessive mesh distortion, the team opted to use the advanced mesh control features in the COMSOL® software.
Simulation Results for Fluid-Structure Interaction in a Heart Valve
Let us now take a look at some of the results the team at [Veryst] obtained from their heart valve model, which analyzes flow patterns, variations, and residence times; flow recirculation around heart valves; and how these factors are affected by the movement of a valve. It is also possible to use the model to investigate stress and fatigue in the valve material as well as blood pressure, shear stresses, and deformation. The team also found that simulation enabled them to analyze multiple aspects of the heart valve at once, such as the interaction between blood velocity, valve deformation, and von Mises stress in the valve. The model results (see Figure 4.4) show that there are dead flow zones around the valve and recirculation in the fluid, and both are affected by the opening and closing of the valve. In addition, the root of the valve experiences high stresses. Researchers can use these results to identify potential issues and improve the designs of artificial heart valves. Please note that because this example was made only to demonstrate what can be achieved when modeling heart valves, the results seen here are not completely realistic.

Figure 4.4 FSI Model of a Heart Valve Opening (left) and Closing (right)

Improving the Design of Medical Devices with FSI Modeling
This example shows what medical researchers can achieve by using FSI simulation. Using models like this one, researchers and engineers can predict the behavior of real heart valves, potentially using this information to improve the designs of artificial ones. [Elabbasi] also mentioned that "FSI modeling should be performed by all medical device companies working on heart valves, providing related products (stents, for example), or analyzing cardiovascular diseases (aneurysms, for example)." The information provided by such simulations will help improve the design of medical devices, making them more effective in treating diseases.


4.9 Case Study 2 - CFD Simulation of Human Carotid Artery Bifurcation based on Anatomy and Volumetric Blood Flow Rate Measured with MRI
Hemodynamic and geometric variables play a crucial role in the appearance and progression of various vascular diseases. Specifically, the significance of wall shear stress (WSS) and flow disturbances in the formation and rupture of an atherosclerotic plaque, a leading cause of stroke, is well acknowledged. Research shows that there is an increased chance of atheroma buildup around vessel bifurcations, where the blood flow is stagnant or highly disturbed, and WSS is proposed to be a controlling factor in the mechanism of plaque formation and rupture. These findings have motivated researchers to develop techniques for estimating WSS. Through phase-contrast (PC) magnetic resonance imaging (MRI) velocity measurements, qualitative and quantitative assessment of WSS is possible: essentially, PC-MRI measures the blood and tissue velocity at each point in the field of view. However, the relatively low in-plane spatial resolution of the image and the difficulty of detecting the circumferential wall obstruct an accurate estimation of WSS. Another approach is to investigate WSS through computational fluid dynamics (CFD) simulation of the blood flow in patient-specific models. Carrying out such simulations requires accurate anatomic models, imposition of realistic boundary conditions, utilization of an appropriate viscosity model, and inclusion of the wall elasticity. Although accurate segmentation of the blood vessels is a crucial step in the analysis of blood circulation inside the body, realistic boundary conditions are suggested to be just as important for accurately estimating the flow with three-dimensional CFD simulations.
4.9.1 Approaches
Several approaches have been proposed to impose physiologically realistic boundary conditions. A common type is the resistance boundary condition, which does not require any specification of flow rate or pressure at the outlet. However, the resulting flow and pressure waves are forced to be in phase with each other, which violates the wave propagation phenomena. An alternative is to solve the periodic blood flow in the downstream vessels with a 1D method and use the result as the boundary condition for the 3D computational domain91-92. Solving for a large number of downstream vessels, however, requires simplifications that restrict the method to periodic blood flow. Therefore, a 0D model is proposed to track the dynamic nature of time-dependent flow in human arteries. The 0D modeling approach utilizes the concept of a hydraulic-electrical analogue known as the [Windkessel model]93. By prescribing an impedance of the downstream vessels at the outlets, the Windkessel model facilitates the imposition of realistic boundary conditions for 3D simulations of blood flow. While most hemodynamic simulations employ the Newtonian model for arterial flow, several studies during the past decade have suggested that an appropriate nonlinear viscosity model is a key factor in hemodynamic simulations. Such shear-rate-dependent viscosity models have been proposed in the literature, most commonly the Power-law, Carreau-Yasuda, and Casson models. Regardless, the viscosity of the blood depends on several factors, such as the hematocrit level. In contrast, other studies have suggested that the nonlinear effect is negligible in large arteries such as the carotid arteries.

91 Formaggia, L., Gerbeau, J.F., Nobile, F., Quarteroni, A., "On the coupling of 3D and 1D Navier-Stokes equations for flow problems in compliant vessels", Comput. Methods Appl. Mech. Eng. 191(6-7), 561–582 (2001).
92 Laganà, K., Dubini, G., Migliavacca, F., Pietrabissa, R., Pennati, G., Veneziani, A., Quarteroni, A., "Multiscale modelling as a tool to prescribe realistic boundary conditions for the study of surgical procedures", Biorheology 39(3-4), 359–364 (2002).
93 Shi, Y., Lawford, P., Hose, R., "Review of zero-D and 1-D models of blood flow in the cardiovascular system", Biomed. Eng. OnLine 10, 33–71 (2011).
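Relating to the 0D Windkessel outlet model described in Section 4.9.1, the sketch below integrates a three-element (RCR) Windkessel so that an outlet pressure can be fed back to a 3D solver from the computed outflow. The resistance, compliance, and waveform values are illustrative assumptions, not values from the study.

```python
import numpy as np

def windkessel_rcr_pressure(q_of_t, dt, n_steps,
                            Rp=5.0e7, C=1.0e-9, Rd=1.0e9, p_c0=1.0e4):
    """Three-element (RCR) Windkessel outlet model, integrated with forward Euler.

    Outlet pressure: P(t) = P_c(t) + Rp * Q(t),  with  C dP_c/dt = Q(t) - P_c(t)/Rd,
    where Q(t) is the volumetric flow leaving the 3D domain (SI units assumed).
    """
    p_c = p_c0
    pressures = []
    for n in range(n_steps):
        q = q_of_t(n * dt)
        p_c += dt * (q - p_c / Rd) / C       # compliance (capacitor) pressure
        pressures.append(p_c + Rp * q)        # pressure returned to the CFD outlet
    return np.array(pressures)

if __name__ == "__main__":
    # Assumed pulsatile outflow waveform with a 1 s period (m^3/s)
    Q = lambda t: 5.0e-6 * (1.0 + 0.6 * np.sin(2.0 * np.pi * t))
    p = windkessel_rcr_pressure(Q, dt=1e-3, n_steps=3000)
    print(f"pressure range over the last cycle: {p[-1000:].min():.0f} - {p[-1000:].max():.0f} Pa")
```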


Therefore, this study aims to investigate the nonlinear effect on hemodynamic factors and to further improve the procedure for quantifying uncertainties using PC-MRI measurements. Although several studies have shown that these simplifications are acceptable for hemodynamic simulation94, there is still a need to perform simulations that are as realistic as possible. To model hemodynamics in large arteries completely, the deformability of the arterial walls should also be considered; however, for the sake of simplicity, rigid walls are assumed here.
4.9.2 Results and Discussion
CFD analysis was performed using the computed waveforms as boundary conditions on a 1-million-element mesh with a time step of 0.005 s for four cardiac cycles, and the heart rate was assumed to be 60 bpm for all simulations. The simulations were performed for different viscosity models, and the axial velocity contours on axial slices over one cardiac cycle are presented in Figure 4.5 (A-B). On the slice just below the bifurcation, negative axial velocity is present, which implies that the blood is recirculating in this area. Moreover, the velocity profiles do not vary drastically from the Newtonian model (A) to the Power-law model (B), as can be observed in Figure 4.5 (A-B); however, near the recirculation zone, where the velocity magnitude is lowest, slightly different contour lines can be noticed. The distribution of the time-averaged wall shear stress (TAWSS) on the carotid artery wall is shown in Figure 4.5 (C-D). The difference between the viscosity models can be seen in the WSS contours: near the carotid sinus, where the carotid artery expands, dissimilar contour lines are observable. In addition, the lowest TAWSS is seen in this region, which is especially important because the carotid sinus is the location where plaque accumulates. As expected, the highest WSS occurs near the junction points due to the high velocity gradients. Similar to what was observed in other studies, the results show that the blood flow is predominantly unidirectional during the cardiac cycle for the healthy subject, and therefore the Oscillatory Shear Index (OSI) is almost zero everywhere on the whole circumference of the domain. Although the Power-law and Newtonian results are very similar to each other, the discrepancy in the TAWSS is more noticeable in the carotid artery model of the patient than in that of the healthy subject, particularly around the plaque region; near the outlet region of the ECA, the Newtonian model shows higher values.

Figure 4.5 Axial velocity (A: Newtonian, B: Power-law) and time-averaged wall shear stress (C: Newtonian, D: Power-law)

94 Steinman, D.A., "Assumptions in modelling of large artery hemodynamics", in: Ambrosi, D., Quarteroni, A., Rozza, G. (eds), Modeling of Physiological Flows, pp. 1–18, Springer, Milan (2012).


Figure 4.6 depicts the anatomical model and the velocity contours for the patient model: the anatomic model of the patient with carotid artery plaque (left) and the axial velocity at peak systole (right). Since the geometry is considerably more tortuous, the blood flow is more unpredictable in this case. Starting from the inlet, the velocity profile is almost parabolic. As the flow approaches the plaque and the bifurcation region, it follows the geometry of the unhealthy carotid artery. Recirculation regions appear near the carotid sinus throughout the cardiac cycle, and low-velocity flow occurs near the carotid bifurcation and carotid sinus; however, the flow appears to be fully developed superior to the ICA.

Figure 4.6 Anatomic Model for the Patient with Carotid Artery Plaque

4.10 Case Study 3 - CFD Analysis of the Effect of Plaques in the Left Coronary Artery
Coronary artery disease (CAD) is the leading cause of death in advanced countries. The most common cause of CAD is atherosclerosis, which is caused by the presence of plaques on the artery wall, resulting in lumen stenosis95. Plaques have been particularly associated with blood clots and compromised blood flow to the myocardium. This occurs when the coronary plaques suddenly rupture; if a clot cannot be treated in time, the heart muscle is impaired by ischemic changes, leading to myocardial ischemia, infarction or, more severely, necrosis. Early detection and diagnosis of CAD is therefore particularly important for reducing mortality and subsequent complications96. The natural history of coronary plaque depends not only on the formation and progression of atherosclerosis, but also on the vascular remodeling response. If the local wall shear stress is low, a proliferative plaque will form, and the local inflammatory response will stimulate the formation of the so-called "vulnerable plaque", which is prone to rupture with superimposed thrombus formation. The vast majority of these inflamed high-risk plaques cannot be detected by anatomic and myocardial perfusion imaging.

95 Thanapong Chaichana, Zhonghua Sun, and James Jewkes, "Computational Fluid Dynamics Analysis of the Effect of Plaques in the Left Coronary Artery", Computational and Mathematical Methods in Medicine, Volume 2012, Article ID 504367, 9 pages. doi:10.1155/2012/504367.
96 Australian Institute of Health and Welfare, "The tenth biennial health report of the Australian Institute of Health and Welfare", AIHW, Canberra, Australia, 2006.


Since the progression and development of vulnerable plaque is associated with low wall shear stress and the presence of expansive remodeling, measuring these characteristics in vivo will enable risk stratification for the entire coronary circulation97. Figure 4.7 shows a 3D CT visualization of a normal left coronary artery with side branches in a patient with suspected coronary artery disease. The wall shear stress (WSS), wall pressure, and blood flow changes in the human body cannot be measured directly on blood vessels, whereas computational fluid dynamics (CFD) provides an alternative way to diagnose CAD98. The WSS in the coronary artery is known to play a significant role in the early formation of CAD, and the WSS at the local vessel wall can indicate a predisposition to atherosclerosis development in various anatomical sections, thus enabling the prediction of coronary disease. CFD allows efficient and accurate computation of the hemodynamic features of both normal and abnormal situations in the cardiovascular system, i.e. in vivo simulation of coronary artery flow changes. CFD differs from medical imaging visualization: imaging techniques such as coronary angiography or computed tomography angiography show anatomic alterations of the coronary artery wall due to the presence of plaques, allowing only an assessment of the degree of lumen change, such as stenosis or occlusion. In contrast, CFD analysis enables the identification of hemodynamic changes in the coronary artery even before the plaques are actually formed at the artery wall or can occlude the vessels. Therefore, to some extent, CFD allows early detection of coronary artery disease and improves the understanding of the progression of plaques, which is considered of paramount importance to clinical treatment. The purpose of this study was to investigate the hemodynamic effect of plaques in the left coronary artery by using CFD analysis. Simulated plaques were inserted into the left main stem and left anterior descending coronary arteries (taken from a selected patient's data), and hemodynamic analysis was performed to correlate the presence of plaques with the subsequent flow changes in the coronary main and side branches.

Figure 4.7 3D CT visualization of a normal left coronary artery in a patient with suspected coronary artery disease

4.10.1 Patient Data Selection for Generation of Left Coronary Artery Model
A sample patient suspected of CAD who underwent multi-slice CT angiography was selected, and the patient's volume CT data were used to generate a 3D coronary model. The original CT data were saved in the Digital Imaging and Communications in Medicine (DICOM) format and then transferred to a workstation equipped with Analyze 7.0 (AnalyzeDirect, Inc., Lenexa, KS, USA) for image post-processing and segmentation.

97 F. J. Rybicki, S. Melchionna, D. Mitsouras et al., "Prediction of coronary artery plaque progression and potential rupture from 320-detector row prospectively ECG-gated single heart beat CT angiography: lattice Boltzmann evaluation of endothelial shear stress", International Journal of Cardiovascular Imaging, vol. 25, 2009.
98 S. K. Shanmugavelayudam, D. A. Rubenstein, and W. Yin, "Effect of geometrical assumptions on numerical modeling of coronary blood flow under normal and disease conditions", Journal of Biomechanical Engineering, vol. 132, no. 6, article 061004, 2010.


The three-dimensional (3D) volume data were post-processed and segmented using a semiautomatic method with a CT-number thresholding technique99-100, and manual editing was performed in some slices to remove soft tissues and artefacts. The segmented model was produced with a special focus on the left coronary artery (LCA) and its branches, and the 3D LCA model was saved in STL format for further reconstruction purposes. Figure 4.7 shows the anatomical details of the left coronary artery101.
4.10.2 Realistic Plaques Modelling
The plaques and the degree of lumen stenosis on the coronary artery wall were simulated at the left main stem (LMS) and the left anterior descending (LAD) artery, as these branches are the common locations where plaques tend to form and induce myocardial ischemic changes. The plaques produced a lumen narrowing of approximately 60% of the diameter at the LMS and LAD, since more than 50% lumen stenosis leads to significant hemodynamic changes in the flow within the coronary artery. Figure 4.8 shows the segmented LCA model in various views with the positions of the plaques in the left coronary artery; the double arrows and rectangles indicate the effective plaque locations (EPL).

Figure 4.8 Plaque distribution in the left coronary artery model

4.10.3 Generation of Computational Models
The surface of the LCA model, with and without plaques, was prepared using Blender version 2.48 (Blender Institute, Amsterdam, Netherlands). A gentle B-spline smoothing technique was applied between the left main trunk and the side branches to reduce any potential nonphysical behavior induced by sharp edges. The surface models of the plaque-bearing and normal coronary arteries were converted into solid models and saved in STL format for the subsequent creation of the meshes. Both models were used to create hexahedral and tetrahedral meshes for the CFD simulations. The hexahedral mesh for the LCA model without plaques had 949,289 elements and 1,062,280 nodes, while that for the LCA model with plaques had 928,311 elements and 1,041,936 nodes.

99 Z. Sun, R. J. Winder, B. E. Kelly, P. K. Ellis, and D. G. Hirst, "CT virtual intravascular endoscopy of abdominal aortic aneurysms treated with suprarenal endovascular stent grafting", Abdominal Imaging, vol. 28, no. 4, pp. 580–587, 2003.
100 Z. Sun, R. J. Winder, B. E. Kelly, P. K. Ellis, P. T. Kennedy, and D. G. Hirst, "Diagnostic Value of CT Virtual Intravascular Endoscopy in Aortic Stent-Grafting", Journal of Endovascular Therapy, 2004.
101 Thanapong Chaichana, Zhonghua Sun, and James Jewkes, "Computational Fluid Dynamics Analysis of the Effect of Plaques in the Left Coronary Artery", Computational and Mathematical Methods in Medicine, Volume 2012, Article ID 504367, 9 pages. doi:10.1155/2012/504367.


The tetrahedral mesh had 15,519 nodes and 78,618 elements. The meshes were generated using ANSYS ICEM CFD®, with details described in previous studies102.
4.10.4 Application of Physiological Parameters
In order to ensure that the analysis reflects realistic in vivo conditions, realistic physiological boundary conditions were applied for the 3D numerical analysis. The transient simulation was performed using accurate hemodynamic rheological and material properties, as described in a previous study103. A pulsatile velocity was applied as the inlet boundary condition at the left main stem, and a zero pressure gradient was applied at the left anterior descending and left circumflex outlet boundaries104. Appropriate rheological parameters were applied, with a blood density of 1060 kg/m3 and a blood viscosity of 0.0035 Pa·s. The blood flow was assumed to be laminar, and a no-slip condition was applied at the walls. The plaque was assumed to be a rigid body, and the blood was assumed to be a Newtonian, incompressible fluid. In addition, the comparison of WSS between Newtonian and non-Newtonian models was considered, especially at the stenotic locations. The non-Newtonian blood model was simulated using the generalized power law:

$$\mu = \lambda(\dot{\gamma})\,|\dot{\gamma}|^{\,n(\dot{\gamma})-1}\ ,\qquad
\lambda(\dot{\gamma}) = \mu_{\infty} + \Delta\mu\,\exp\!\left[-\left(1+\frac{|\dot{\gamma}|}{a}\right)\exp\!\left(\frac{-b}{|\dot{\gamma}|}\right)\right]\ ,\qquad
n(\dot{\gamma}) = n_{\infty} - \Delta n\,\exp\!\left[-\left(1+\frac{|\dot{\gamma}|}{c}\right)\exp\!\left(\frac{-d}{|\dot{\gamma}|}\right)\right] \qquad \textbf{Eq. 4.6}$$

where μ∞ = 0.035, n∞ = 1.0, Δμ = 0.25, Δn = 0.45, a = 50, b = 3, c = 50, and d = 4. The generalized power law model is defined so that it fits experimental stress-strain measurements over the range of strain rates 0.1 < γ̇ < 1000 s−1.
4.10.5 Performance of Computational Hemodynamic Analysis
The Navier-Stokes equations were solved using the ANSYS CFX CFD package on a Microsoft Windows 32-bit machine with 6 GB of RAM and an Xeon W3505 2.53 GHz CPU. The CFD simulation was run for 80 time steps, representing 1.0 second of pulsatile flow (0.0125 seconds per time step), with each time step converged to a residual target of less than 1×10−4 in approximately 100 iterations; the CFD solution was fully converged in approximately 8,000 iterations per LCA model, and the calculation time for each LCA model was approximately 2 hours. Flow velocity, cross-sections of the velocity pattern, and the pressure gradient were calculated and visualized using ANSYS CFD. Figure 4.9 represents the area of interest at the left coronary bifurcation and shows the measurement positions of the cross-sections of the models with and without plaques. The sectional planes were separated into three groups, Sections A–E, Sections F–J, and Sections K–O, with a distance of approximately 0.5 mm between the sections in each group.

102 T. Chaichana, Z. Sun, and J. Jewkes, "Computation of hemodynamics in the left coronary artery with variable angulations", Journal of Biomechanics, vol. 44, no. 10, pp. 1869–1878, 2011.
103 T. Frauenfelder, M. Lotfey, T. Boehm, and S. Wildermuth, "Computational fluid dynamics: hemodynamic changes in abdominal aortic aneurysm after stent-graft implantation", Cardiovascular and Interventional Radiology, vol. 29, no. 4, pp. 613–623, 2006.
104 E. Wellnhofer, J. Osman, U. Kertzscher, K. Affeld, E. Fleck, and L. Goubergrits, "Flow simulation studies in coronary arteries - Impact of side-branches", Atherosclerosis, vol. 213, no. 2, pp. 475–481, 2010.
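To illustrate Eq. 4.6, the sketch below evaluates the generalized power-law viscosity with the coefficients listed above. The coefficients appear to be expressed in poise-based units (μ∞ = 0.035 P = 0.0035 Pa·s, matching the Newtonian value quoted in Section 4.10.4); treating them that way, and converting the result to Pa·s, is an assumption.

```python
import numpy as np

def generalized_power_law_viscosity(gamma_dot,
                                    mu_inf=0.035, d_mu=0.25,
                                    n_inf=1.0, d_n=0.45,
                                    a=50.0, b=3.0, c=50.0, d=4.0):
    """Generalized power-law viscosity of Eq. 4.6 with the coefficients quoted in the text.

    Coefficients are assumed to be in poise; the returned viscosity is converted to Pa.s.
    """
    g = np.maximum(np.abs(np.asarray(gamma_dot, dtype=float)), 1e-6)
    lam = mu_inf + d_mu * np.exp(-(1.0 + g / a) * np.exp(-b / g))
    n = n_inf - d_n * np.exp(-(1.0 + g / c) * np.exp(-d / g))
    return 0.1 * lam * g ** (n - 1.0)            # poise -> Pa.s

if __name__ == "__main__":
    for g in (0.1, 1.0, 10.0, 100.0, 1000.0):    # 1/s, the fitted range
        print(f"gamma_dot = {g:7.1f} 1/s -> mu = {generalized_power_law_viscosity(g):.5f} Pa.s")
```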


The parameter used to characterize the impact of the plaques at the coronary bifurcation on the hemodynamic flow was the magnitude of the local pressure spatial gradient (PSG), defined as

$$\mathrm{PSG} = \sqrt{\left(\frac{\partial p}{\partial x}\right)^{2} + \left(\frac{\partial p}{\partial y}\right)^{2} + \left(\frac{\partial p}{\partial z}\right)^{2}} \qquad \textbf{Eq. 4.7}$$

where p is the pressure in the area of interest; the local PSG is evaluated from the spatial derivatives of the local pressure, and its value oscillates in relation to the percentage of plaque in the coronary lumen.
4.10.6 CFD Results of the Left Coronary Artery
CFD analyses of the realistic left coronary artery models with and without plaques were successfully performed under in vivo physiological conditions during the systolic and diastolic phases. Peak systolic velocity and pressure were reached at 0.4 s, and the diastolic phase was reached at 0.7 s of the cardiac cycle. The analysis demonstrates a strong relationship between hemodynamic change and the plaques in the left coronary artery.

Figure 4.9 The EPL, posterior view of the left coronary artery

4.10.6.1 Cutting Plane Visualization
Flow velocity was visualized inside the LMS at Sections A–E, as shown in Figure 4.9. The flow patterns in the pre- and post-stenotic cases were similar in Sections A and B (velocities from 0 to 17.43 mm/s). However, at the systolic peak the flow velocity increased in Sections C–E (velocities from 23.96 to 30.50 mm/s), at the location of the plaques. In addition, the flow pattern was affected by the presence of the plaques, starting from Sections A–E as observed in the post-stenotic region during the diastolic phase, with the velocity increasing from 28.32 to 30.50 mm/s. For cutting views of Sections F–J and K–O, please refer to105.

105 Thanapong Chaichana, Zhonghua Sun, and James Jewkes, "Computational Fluid Dynamics Analysis of the Effect of Plaques in the Left Coronary Artery", Computational and Mathematical Methods in Medicine, Volume 2012, Article ID 504367, 9 pages. doi:10.1155/2012/504367.
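A minimal sketch of how the PSG of Eq. 4.7 could be evaluated on a uniform pressure grid is given below; the synthetic pressure field, grid, and spacings are illustrative assumptions only.

```python
import numpy as np

def pressure_gradient_magnitude(p, dx, dy, dz):
    """Magnitude of the local pressure spatial gradient, PSG (Eq. 4.7).

    p is a 3D array of pressure values on a uniform grid with spacings
    dx, dy, dz; np.gradient supplies the three partial derivatives.
    """
    dpdx, dpdy, dpdz = np.gradient(p, dx, dy, dz)
    return np.sqrt(dpdx ** 2 + dpdy ** 2 + dpdz ** 2)

if __name__ == "__main__":
    x, y, z = np.meshgrid(np.linspace(0, 0.01, 20),
                          np.linspace(0, 0.01, 20),
                          np.linspace(0, 0.01, 20), indexing="ij")
    p = 1.0e4 - 8.0e5 * x + 1.0e5 * y     # linear test field: |grad p| is constant
    h = 0.01 / 19
    psg = pressure_gradient_magnitude(p, h, h, h)
    print(f"PSG ~ {psg.mean():.1f} Pa/m (exact {np.hypot(8.0e5, 1.0e5):.1f})")
```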


Figure 4.10 Flow velocity observed in the pre- and post-plaque simulated models: (a) post-plaque at t = 0.4 s, (b) pre-plaque at t = 0.4 s, (c) post-plaque at t = 0.7 s, (d) pre-plaque at t = 0.7 s

4.10.6.2 Wall Shear Stress (WSS) Comparisons
The analysis of WSS focused particularly on the stenotic locations, comparing the non-Newtonian and Newtonian fluid models. Figure 4.12 compares the WSS for the different fluid viscosities in the left coronary model with plaques present, showing the non-Newtonian (a, c) and Newtonian (b, d) models in the realistic coronary artery. WSS contour values ranged from 0 Pa to 3.50 Pa for both viscosity models. At the peak systolic phase the WSS at the plaques in the LMS branch differed between the models, ranging from 0.50 Pa to 1.75 Pa for the non-Newtonian model and from 0.50 Pa to 1.0 Pa for the Newtonian model (Figure 4.12 (b)). Similar WSS values, ranging from 1.50 Pa to 3.50 Pa for both viscosity models (Figure 4.12 (c-d)), were found at the plaque positions in the LMS branch during the diastolic phase. At the stenotic locations in the LAD, the WSS at peak systole ranged from 0.50 Pa to 1.0 Pa for the non-Newtonian model (see Figure 4.12 (a)) and from 0.50 Pa to 0.75 Pa for the Newtonian model (Figure 4.12 (b)), while at the diastolic phase the WSS at the plaque positions ranged from 1.50 Pa to 3.50 Pa for the non-Newtonian model (Figure 4.12 (c)) and from 1.50 Pa to 3.25 Pa for the Newtonian model (Figure 4.12 (d)).


4.10.7 Discussion
This study shows that coronary plaques have a significant impact on the subsequent flow changes in the coronary artery, in addition to the local hemodynamic interference caused by the plaques themselves. This is clinically important, as further effects could result from the plaques' interference, leading to adverse effects on the coronary artery such as lumen stenosis or worsening of the atherosclerosis. It is well known that plaques most commonly form at coronary bifurcations and angulations, and that this is an important factor related to the development of atherosclerosis, as confirmed by our and other studies. Multi-slice CT angiography and intravascular ultrasound have been widely used to detect and characterize plaques in the coronary arteries. Although promising results have been achieved with these imaging modalities, they are limited to image visualization and to the identification of coronary lumen changes due to the presence of plaques; no information is available about the interference of the plaques with the blood flow. In contrast, CFD overcomes those limitations by enabling the analysis of coronary blood flow and rheological factors. This study investigated two important factors, PSG and flow velocity, and quantified the impact of plaques on the flow changes in the coronary arteries. The static wall pressure does not reflect the velocity profile from the flow axis to the vessel wall; in the clinical situation, the PSG magnitude has instead been used to judge the severity of plaques, and the region of highest PSG may be relevant to potential coronary plaque rupture. In this study, the CFD analysis of the LCA with plaques showed that the highest PSG occurred at the locations in both the LMS and LAD where plaques were simulated (see Figure 4.11), with measured PSG values ranging from 743.21 to 800 kg/m2s2. The presence of plaques in the coronary artery obstructs the blood flow to the myocardium and consequently affects the flow velocity; moreover, plaques influencing the hemodynamics may lead to the further spread of plaques. Since velocity is the main component of the local WSS and acts in the same direction as the local WSS (meaning that the flow velocity is low where the WSS is low, as observed in a previous study), our analysis explicitly describes the hemodynamic changes inside the LCA surrounding the plaque locations, the so-called effective plaque locations (EPL). In Sections A–E, we found that the flow velocity fluctuated in the post-stenotic regions during the cardiac cycle, which could lead to abnormalities at the coronary wall responsible for atherosclerosis. In Sections I–L, flow recirculation occurred, and a region of low velocity was observed within a short distance of the plaques. Consequently, the plaques could generate an effect that spreads into an area of low flow velocity, as demonstrated in Sections I–L, with measured low-velocity values ranging from 0 to 2.18 mm/s. This is confirmed by our previous analysis showing that the progression of plaques develops in low-flow regions. Our analysis provides insight into the effect of plaques on the subsequent coronary flow changes, although further studies are needed to verify these preliminary findings.

Figure 4.11 Cross-sectional views of Sections A–E at the left main stem


The WSS in the non-Newtonian model was found to be similar to that observed in the Newtonian model at the plaque locations, although more detail is resolved by the non-Newtonian model, as shown in Figure 4.12. The effect of the plaques in the left coronary artery is clearly shown by the Newtonian model, which is therefore adequate for analysis of the plaque effect. This comparison of the WSS between the different viscosity models is confirmed by previous studies. The non-Newtonian model was simulated using the generalized power law, as it has been reported to produce WSS effects on coronary flow similar to those of the Newtonian model106.

Figure 4.12 Comparison of WSS between the non-Newtonian and Newtonian models observed in the coronary artery with presence of plaques: (a) non-Newtonian at t = 0.4 s, (b) Newtonian at t = 0.4 s, (c) non-Newtonian at t = 0.7 s, (d) Newtonian at t = 0.7 s

106 Thanapong Chaichana, Zhonghua Sun, and James Jewkes, "Computational Fluid Dynamics Analysis of the Effect of Plaques in the Left Coronary Artery", Computational and Mathematical Methods in Medicine, Volume 2012, Article ID 504367, 9 pages. doi:10.1155/2012/504367.


4.10.8 Limitations
There are some limitations of this study that should be addressed. Firstly, the realistic left coronary models, both pre- and post-stenotic, were assumed to have rigid rather than elastic walls; the simulation therefore does not fully reflect the physiological situation, since the coronary wall moves during the cardiac cycle. Secondly, the assumption of a Newtonian blood model becomes important, especially in regions of low flow and low wall shear stress; nevertheless, a previous study has shown that the Newtonian assumption is reasonable in this configuration. Thirdly, the realistic plaque positions may be affected by the left coronary side branches, which have not been evaluated in this study. Future studies will therefore use coronary models with a more realistic geometry, extended to evaluate the effect of the side branches. In conclusion, we studied the effect of simulated plaques in a realistic left coronary artery on the hemodynamic changes at the plaque locations as well as in the pre- and post-stenotic regions of the coronary artery. Plaques in the left coronary artery have a direct effect on hemodynamic quantities such as recirculating flow, low-flow-velocity regions, wall shear stress, and wall pressure gradient, indicating the potential for plaques to rupture and for atherosclerosis to progress. Further studies focusing on the realistic effect of plaques on the coronary side branches should be performed to verify these results107.

107 See footnote 106.


5 Mesh Free Methods for CFD
While algorithms have seen great advances in CFD, mesh generation methods have lagged behind, creating a computational bottleneck. For industry and government looking to impact current and future products with simulation technology, mesh generation imposes great challenges. Many generation procedures lack automation, requiring many man-hours, which are becoming far more expensive than computer hardware. More automated methods are less reliable for complex geometries with sharp corners, concavity, or otherwise complex features, and most mesh generation methods to date require a great deal of user expertise to achieve proper stretching, resolution, and structure108. The motivation behind meshless methods lies in releasing the burden of mesh generation. Since the application of computational methods to real-world problems appears to be paced by mesh generation, alleviating this bottleneck potentially impacts an enormous field of problems. It is not clear at this point how effective meshless methods will be at alleviating meshing problems: while a rigid mesh is not required, sufficiently dense point distributions are still needed, the points must be grouped locally to form clouds, and obtaining optimal clouds for different methods is a nontrivial problem. However, recent progress in the area of point distribution and cloud generation by Löhner and others109-110 has shown great promise in this area. Several of the most notable meshless methods are:
• Smooth Particle Hydrodynamics (SPH)
• Mesh free Local Petrov-Galerkin (MLPG)
• Methods based on Radial Basis Functions (RBF)
• Finite Point Methods (FPM)
• Mesh free boundary schemes
• Reproducing Kernel Particle Method (RKPM)

These methods are also summarized in works by Liu111 and Liu and Gu112.

5.1 Smooth Particle Hydrodynamics (SPH)

The method of SPH, introduced by Monaghan113, makes use of an integral representation of a function at a point given a set of surrounding points, called a kernel approximation. It uses no mesh, and points are free to move past one another, consistent with a Lagrangian approach. While SPH was first developed to handle astrophysical phenomena in open space, the method was later applied to structures, fracture simulation, fluid flow, and other fields. Monaghan114 showed that the SPH method with artificial viscosity could accurately capture shock waves in one-dimensional shock tube problems. Methods based on an SPH formulation are well suited to problems of infinite domain in which the problem size is not known in advance.

108 Aaron Jon Katz, "Meshless Methods for Computational Fluid Dynamics", PhD dissertation, Department of Aeronautics and Astronautics, Stanford University, January 2009.
109 R. Löhner and E. Oñate, "An advancing front point generation technique", Communications in Numerical Methods in Engineering, 14:1097–1108, 1998.
110 R. Löhner, C. Sacco, and E. Oñate, "A general advancing front technique for filling space with arbitrary objects", Int. J. Numerical Meth. Engineering, 61:1977–1991, 2004.
111 G. R. Liu, Mesh Free Methods: Moving Beyond the Finite Element Method, CRC Press, 2003.
112 G. R. Liu and Y. T. Gu, An Introduction to Mesh Free Methods and Their Programming, Springer, 2005.
113 J. J. Monaghan and R. A. Gingold, "Shock simulation by the particle method SPH", Journal of Computational Physics, 52:374–389, 1983.
114 See footnote 113.
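To make the kernel approximation concrete, the sketch below evaluates the standard 2D cubic-spline smoothing kernel and the SPH summation estimate of density; it is a generic textbook example, not code from any of the works cited here, and the lattice set-up is an illustrative assumption.

```python
import numpy as np

def cubic_spline_kernel_2d(r, h):
    """Standard cubic-spline (M4) SPH kernel in 2D; sigma = 10/(7 pi h^2)."""
    q = np.asarray(r, dtype=float) / h
    sigma = 10.0 / (7.0 * np.pi * h ** 2)
    w = np.zeros_like(q)
    m1 = q <= 1.0
    m2 = (q > 1.0) & (q < 2.0)
    w[m1] = 1.0 - 1.5 * q[m1] ** 2 + 0.75 * q[m1] ** 3
    w[m2] = 0.25 * (2.0 - q[m2]) ** 3
    return sigma * w

def sph_density(positions, masses, h):
    """Kernel (summation) density estimate: rho_i = sum_j m_j W(|x_i - x_j|, h)."""
    diff = positions[:, None, :] - positions[None, :, :]
    r = np.linalg.norm(diff, axis=-1)
    return (masses[None, :] * cubic_spline_kernel_2d(r, h)).sum(axis=1)

if __name__ == "__main__":
    # Regular lattice of unit-mass particles with spacing dx; expected density ~ 1/dx^2
    dx = 0.05
    xs = np.arange(0.0, 1.0, dx)
    pos = np.array([(x, y) for x in xs for y in xs])
    rho = sph_density(pos, np.ones(len(pos)), h=1.3 * dx)
    print(f"interior density ~ {rho.max():.1f} (expected ~ {1.0/dx**2:.1f})")
```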


While SPH has become popular for intensely dynamic problems in which a static or even dynamic mesh may not properly resolve the relevant physics, certain implementation difficulties are inherent in the method. These include the selection of a proper domain of influence with weighting functions, efficient nearest-neighbor particle searching, and the determination of a smoothing length for the force computations at each particle. The Reproducing Kernel Particle Method (RKPM), introduced by Liu, Jun and Zhang115, is very similar to the SPH method in that it uses a finite integral representation to discretize the governing PDEs. However, RKPM adds a correction function to the base kernel approximation, improving the accuracy, especially near boundaries116. The RKPM method has been applied to fluids, structures, and acoustics: Lesoinne and Kaila117 used RKPM to compute aeroelastic effects of aircraft with large control surface deflections, and Zhang, Wagner, and Liu118 showed that RKPM is well suited to domain decomposition for large-scale parallel computing.
5.1.1 Mesh free Local Petrov-Galerkin
The MLPG method has arisen from the finite element community and is based on the weak form of a given PDE. While the use of the weak form of PDEs relaxes the consistency requirements on the field variable approximation, many algorithms in CFD bypass the rigorous use of weak forms. Weak forms require the use of numerical integration, since they satisfy global integral forms of the governing equations, and numerical integration, along with other rigorous aspects of weak forms, makes them computationally inefficient compared with simple FDM or FVM approaches. Although Jameson119 showed the equivalency of one FVM scheme with a Galerkin method, most development in CFD has been based on strong forms of the governing equations, which lead to simple and efficient conservative schemes. Nonetheless, an immense mathematical foundation has been developed based on the weak forms used for a variety of FEM applications. Developed by Atluri and others120-121, the MLPG method is based on a Petrov-Galerkin formulation in which the weight and trial functions used in the weak form of the equations need not be the same. This gives the method a "local" nature in which the integral in the weak form is satisfied over a local domain. The MLPG method thus requires a local "background grid" to perform the integration demanded by the weak form; however, the integral is performed locally, relieving the need for the global background integration used in related methods, and the local background grid may consist of simple shapes, such as circles or squares. By all practical measures, MLPG is essentially meshless. The approximation of the field variables for the MLPG method is constructed using a moving least squares approach. Least squares representations of a function do not pass through the discrete sampling points of the function; instead, they construct a smooth representation which minimizes the error of approximation. This fact has posed some difficulties in obtaining accurate and stable boundary conditions for the MLPG approach. The MLPG scheme is very general and has been applied to various problems.

115 W. K. Liu, S. Jun, and Y. F. Zhang, "Reproducing kernel particle methods", International Journal for Numerical Methods in Fluids, 20:1081–1106, 1995.
116 F. C. Gunther and W. K. Liu, "Implementation of boundary conditions for meshless methods", Computer Methods Appl. Mech. Engineering, 163:205–230, 1998.
117 M. Lesoinne and V. Kaila, "Meshless aero-elastic simulations of aircraft with large control surface deflections", AIAA paper 2005-1089, AIAA 43rd Aerospace Sciences Meeting and Exhibit, Reno, NV, January 2005.
118 L. T. Zhang, G. J. Wagner, and W. K. Liu, "A parallelized mesh free method with boundary enrichment for large-scale CFD", Journal of Computational Physics, 176:483–506, 2002.
119 A. Jameson, T. J. Baker, and N. P. Weatherill, "Calculation of inviscid transonic flow over a complete aircraft", AIAA paper 1986-0103, AIAA 24th Aerospace Sciences Meeting, Reno, NV, January 1986.
120 S. N. Atluri and T. Zhu, "A new meshless local Petrov-Galerkin (MLPG) approach in computational mechanics", Computational Mechanics, 22:117–127, 1998.
121 S. N. Atluri, H. G. Kim, and J. Y. Cho, "A critical assessment of the truly meshless local Petrov-Galerkin (MLPG) and local boundary integral equation (LBIE) methods", Computational Mechanics, 24:348–372, 1999.


Navier-Stokes equations used an up winding scheme for stabilization of the convection operator in the stream wise direction. 5.1.2 Mesh free Methods Based on Radial Basis Functions Radial basis functions are functions which have no preferred direction, but only depend on norms in space. Most often, the Euclidean distance is used as the norm. Common RBFs include Gaussians, thin plate splines, and multi-quadrics. In general, RBFs are smooth and continuously differentiable. When used for interpolation purposes, RBF approximations are constructed such that they pass through data points exactly. It is difficult to prove any order of accuracy of such approximations since RBFs are not based on Taylor series or polynomial expansions. While RBFs have been widely used in scattered data interpolation, their application to the solution of PDEs is relatively new. The symmetric and un-symmetric forms were compared independently by and compared an RBF method to the finite element method in terms of accuracy and efficiency, showing improved accuracy of the RBF method over FEM. Sharan has used the popular multi-quadric RBFs to solve elliptic PDEs. In a similar work, [Sarler] formulated a solution method for diffusion problems based on RBFs. In a more general work, integrated the theory of Galerkin methods with radial basis functions. More recently, [Divo and Kassab] have used RBFs to model convective viscous flows and heat transfer problems. [Chinchapatnam] has used a localized RBF method to compute incompressible viscous flows. Radial basis methods for compressible flows are much less common, however Shu has recently proposed such a method based on an upwind approach. 5.1.3 Finite Point Methods By far, the most prevalent meshless schemes for CFD have been the so-called finite point methods. Finite point methods are usually based on the strong form of the governing PDEs and have given rise to several variants. In general, FPMs are based on least squares fitting of functions to discrete points. These approximate functions form the basis of discretization methods for PDEs. Least squares techniques have been widely used in traditional CFD methods as a means of reconstructing high order solutions, as discussed by [Mavriplis]122. However, the use of least squares as the primary mechanism for PDE discretization in the meshless sense is relatively new. Finite point methods were originally derived as generalizations of FDM for irregular point distributions by [Chung]123. Finite point methods may be categorized into two main classes: methods derived from Taylor series, and methods based on polynomial basis functions. Actually the Taylor series approach is a specific case of a polynomial method in which the approximated function is constrained to pass through the local cloud center. The Taylor approach is intuitive and has formed the basis for many schemes, including the Least Squares Kinetic Upwind Method (LSKUM). Other approaches based on Taylor series expansions includes the order of accuracy of the Taylor method for an upwind scheme. The methods based on polynomial basis functions are equally numerous as the Taylor based methods. [Batina] was one of the first to use a polynomial basis in conjunction with least squares to compute derivatives for the Euler and Navier Stokes equations. He used an unweighted least squares approach. A similar method was proposed a few years later by [Liu and Su]. Others developed a more rigorous method based on polynomial basis functions. 
Their method incorporated different least squares weighting methods to improve the accuracy of derivatives, together with formulations for higher-order methods, and they applied it to subsonic compressible inviscid and viscous flows. [Löhner and others] extended the method of Oñate to compressible aerodynamic applications with shocks in three dimensions. They implemented their scheme with the van Leer approximate Riemann solver, gradient reconstruction for high resolution, and limiters to capture shocks.

122 D. J. Mavriplis. "Revisiting the least-squares procedure for gradient reconstruction on unstructured meshes", AIAA paper 2003-3986, AIAA 16th Computational Fluid Dynamics Conference, Orlando, FL, June 2003.
123 K. C. Chung. "A generalized finite-difference method for heat transfer problems of irregular geometries", Numerical Heat Transfer, 4:345–357, 1981.


5.1.4 Meshless Boundary Schemes
Many of the methods discussed above have been used to enforce boundary conditions for embedded boundary systems. Embedded boundaries arise with the use of non-body-conforming grids, such as Cartesian grids. Meshless methods have been used in place of cut cells and other related methods. One approach uses a polynomial least squares method to compute inviscid slip boundary conditions on embedded Cartesian meshes, with encouraging results for two- and three-dimensional inviscid test cases. Meshless embedded boundary conditions have also been implemented for high-Reynolds-number viscous flows using the concept of a sub-grid to resolve boundary layers. The sub-grid adds additional resolution near the surface, providing points on which to perform meshless computations. All these methods appear to provide attractive alternatives to Cartesian cut cells or other methods of embedded boundary conditions124.

5.2 Solution Procedure for Meshfree Methods 125
The procedure of meshfree methods consists of four basic steps (a minimal driver sketch follows the list):

• Domain representation
• Function approximation
• Formation of system equations
• Solving the global equations
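The four steps above can be pictured as a small driver loop. The following is only a schematic sketch (the callbacks assemble_local and apply_bcs are hypothetical placeholders, not from any particular library), intended to show how the steps fit together for a static problem; nonlinear CFD problems would replace the final linear solve with a Newton or time-stepping loop.

import numpy as np

def solve_meshfree_static(nodes, assemble_local, apply_bcs):
    """Schematic meshfree driver: the four basic steps as placeholder calls."""
    n = len(nodes)                          # step 1: domain represented by scattered nodes
    K = np.zeros((n, n))                    # global system matrix
    b = np.zeros(n)                         # global right-hand side
    for i, x in enumerate(nodes):           # steps 2-3: local approximation -> local equations
        rows, cols, vals, rhs = assemble_local(i, x, nodes)   # user-supplied (hypothetical)
        K[np.ix_(rows, cols)] += vals
        b[rows] += rhs
    K, b = apply_bcs(K, b)                  # enforce boundary conditions
    return np.linalg.solve(K, b)            # step 4: solve the global (here linear) system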

5.2.1 Domain Representation
First, the domain and its boundary are modeled (not discretized) using sets of arbitrarily distributed nodes (see Figure 5.1) in the domain and on its boundary. The nodal distribution is usually not uniform; the density of nodes depends on the accuracy requirement of the analysis. Because the nodes carry the values of a field variable (e.g. density, velocity, etc.), they are often called field nodes. Further in the text, a field variable will be referred to as a field function.

Figure 5.1 Domain representation

5.2.2 Function Approximation
The field function u at any point x = (x, y) within the domain is approximated using the values at the nodes within a "small" local domain of the point x, i.e.

u(\mathbf{x}) = \sum_{i=1}^{n} \phi_i(\mathbf{x})\, u_i
Eq. 5.1

where n is the number of nodes included in the local domain of the point x, u_i is the nodal field function value at the i-th node in the local domain, and φ_i(x) is the shape function of the i-th node. The "small" local domain of x is called the support domain of x and is denoted Ω_x. The size of the support domain defines the number of field nodes used in the approximation at x. Some possible shapes of support domains are shown in Figure 5.2, where the spherical one is the most common.
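As a minimal illustration of Eq. 5.1 only, the sketch below uses normalized inverse-distance weights as a stand-in for the shape functions; a real meshfree method would construct φ_i(x) from an MLS or RBF procedure, so this choice is an assumption made purely for demonstration.

import numpy as np

def shape_functions(x, nodes, eps=1e-12):
    """Stand-in shape functions (inverse-distance weights; hypothetical choice)."""
    d = np.linalg.norm(nodes - x, axis=1)   # distances to the support-domain nodes
    w = 1.0 / (d + eps)                     # inverse-distance weights
    return w / w.sum()                      # normalized so that sum(phi_i) = 1

def approximate(x, nodes, u_nodes):
    """Evaluate u(x) = sum_i phi_i(x) * u_i over the support domain (Eq. 5.1)."""
    phi = shape_functions(x, nodes)
    return phi @ u_nodes

# usage: 4 support nodes carrying nodal values of a field function
nodes = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
u_nodes = np.array([1.0, 2.0, 3.0, 4.0])
print(approximate(np.array([0.5, 0.5]), nodes, u_nodes))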

124 For an excellent survey of the literature on "Meshless methods", see 77.
125 P. Niedoba, L. Čermák, and M. Jícha, "Meshfree methods for computational fluid dynamics", EPJ Web of Conferences 45, 01068 (2013), DOI: 10.1051/epjconf/20134501068.


5.2.3 Formation of System Equations
System equations can be formulated using the shape functions and a strong or weak formulation126. These equations are assembled into the global system matrices for the entire problem domain. For static problems, the global system equations are a set of algebraic equations. For general dynamic problems, they form a set of differential equations.

5.2.4 Solving the Global Equations
The last step depends on the type of equations (algebraic, differential, etc.). Note that the global equations for computational fluid dynamics problems are essentially nonlinear.

Figure 5.2 Different types of support domains

5.3 Method of Smooth Particle Hydrodynamics (SPH)

The smoothed particle hydrodynamics method belongs to the basic meshfree methods and is used for solving partial differential equations. SPH is essentially an interpolation method; the interpolation is based on the theory of integral interpolants using kernels that approximate a delta function. The fluid mass is lumped into smoothed blobs that are moved using Newton's second law directly, without an underlying mesh. In SPH the fluid is modeled as a collection of smooth "blobs" or particles, as depicted in Figure 5.3 (1-D SPH characterization). A system of ordinary differential equations is produced after approximation of the unknown functions (field functions) and their spatial derivatives. This system is most often solved by explicit numerical methods.

5.3.1 Formulation
Function approximation of the field function f(x) is based on an integral representation of the function and is given by

\langle f(\mathbf{x})\rangle = \int_{\Omega_x} f(\boldsymbol{\xi})\, W(\mathbf{x} - \boldsymbol{\xi}, h)\, d\boldsymbol{\xi}
Eq. 5.2

where W(x − ξ, h) is the weight function (i.e., smoothing function or kernel function), with h being the smoothing length, which defines the size of the support domain Ω_x; i.e. the smoothing length determines the number of particles approximating the function at x. Eq. 5.2 is usually referred to as the kernel approximation, or SPH approximation, of the function f(x).

126 See 94.


For practical calculation, Eq. 5.2 must be discretized as follows:

\langle f(\mathbf{x})\rangle = \sum_{j=1}^{n} f(\boldsymbol{\xi}_j)\, V_j\, W(\mathbf{x} - \boldsymbol{\xi}_j, h) = \sum_{j=1}^{n} \frac{m_j}{\rho_j}\, f(\boldsymbol{\xi}_j)\, W(\mathbf{x} - \boldsymbol{\xi}_j, h)
Eq. 5.3

where m_j and ρ_j are the mass and density of the j-th particle in Ω_x (i.e., V_j = m_j/ρ_j is the volume of the j-th particle). Eq. 5.3 is called the particle approximation of the field function f(x). Note that the approximation (Eq. 5.3) corresponds to the approximation (Eq. 5.1) introduced for a general meshfree method. The shape function in this case has the form

\phi_j(\mathbf{x}) = \frac{m_j}{\rho_j}\, W(\mathbf{x} - \boldsymbol{\xi}_j, h)
Eq. 5.4

Approximation of the spatial derivatives of the field function can be obtained by replacing the function f(x) in Eq. 5.2 with its spatial derivative ∇f(x). Using integration by parts, Green's theorem, and a discretization, we obtain a particle approximation of the spatial derivative of the field function in the form

\langle \nabla f(\mathbf{x})\rangle = \sum_{j=1}^{n} \frac{m_j}{\rho_j}\, f(\boldsymbol{\xi}_j)\, \nabla_x W(\mathbf{x} - \boldsymbol{\xi}_j, h)
Eq. 5.5

where ∇_x W(x − ξ_j, h) is the spatial derivative of the weight function with respect to the variable x. We can observe that an approximation of the spatial derivative of a field function is determined using only field function values and derivatives of the weight function. In the same fashion we obtain the Laplacian as

\langle \nabla^2 f(\mathbf{x})\rangle = \sum_{j=1}^{n} \frac{m_j}{\rho_j}\, f(\boldsymbol{\xi}_j)\, \nabla_x^2 W(\mathbf{x} - \boldsymbol{\xi}_j, h)
Eq. 5.6
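The particle approximations of Eqs. 5.3, 5.5 and 5.6 can be sketched in a few lines of NumPy. This is only an illustrative 1-D sketch assuming a Gaussian kernel; particle masses, densities and the smoothing length are chosen arbitrarily for the example.

import numpy as np

def gaussian_w(r, h):
    """Gaussian kernel and its first/second radial derivatives (1-D, for illustration)."""
    w = np.exp(-r**2 / (2.0 * h**2)) / np.sqrt(2.0 * np.pi * h**2)
    dw = -r / h**2 * w                       # dW/dr
    d2w = (r**2 / h**4 - 1.0 / h**2) * w     # d2W/dr2
    return w, dw, d2w

def sph_estimates(x, xi, f, m, rho, h):
    """Particle approximations of f, df/dx and d2f/dx2 at x (Eqs. 5.3, 5.5, 5.6)."""
    r = x - xi
    w, dw, d2w = gaussian_w(r, h)
    vol = m / rho                            # V_j = m_j / rho_j
    f_x   = np.sum(vol * f * w)              # <f(x)>
    df_x  = np.sum(vol * f * dw)             # <df/dx>: the derivative acts on the kernel
    d2f_x = np.sum(vol * f * d2w)            # <d2f/dx2>
    return f_x, df_x, d2f_x

# usage: particles sampling f(x) = sin(x) on a uniform 1-D arrangement
xi = np.linspace(0.0, 2.0 * np.pi, 200)
dx = xi[1] - xi[0]
f, m, rho, h = np.sin(xi), np.full_like(xi, dx), np.ones_like(xi), 2.5 * dx
print(sph_estimates(np.pi / 2, xi, f, m, rho, h))    # approx (1.0, 0.0, -1.0)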

5.3.2 Smoothing Kernels
The use of different kernels in SPH is analogous to using different difference schemes in finite difference methods; thus the choice of smoothing kernel for a specific problem is significant. The derivatives of the smoothing kernels have an important impact on different SPH estimations, but we will now focus on the kernels themselves and their required properties. A suitable kernel must have the following two properties:

\int_{\Omega_x} W(\mathbf{x} - \boldsymbol{\xi}, h)\, d\boldsymbol{\xi} = 1
\quad\text{and}\quad
\lim_{h \to 0} W(\mathbf{x} - \boldsymbol{\xi}, h) = \delta(\mathbf{x} - \boldsymbol{\xi}) =
\begin{cases} \infty & \mathbf{x} = \boldsymbol{\xi} \\ 0 & \text{otherwise} \end{cases}
Eq. 5.7

Eq. 5.7 states that the kernel must be normalized, and that the unit integral ensures that maxima and minima are not enhanced. The kernel must also be positive to ensure that it is an averaging function.


If the kernel is even, then rotational symmetry is enforced, which is useful to ensure invariance under rotations of the coordinate system:

W(\mathbf{x} - \boldsymbol{\xi}, h) \ge 0
\quad\text{and}\quad
W(\mathbf{x} - \boldsymbol{\xi}, h) = W(\boldsymbol{\xi} - \mathbf{x}, h)
Eq. 5.8

Figure 5.4 The choice of different smoothing kernels in 1D (h = 1); left panel: Gaussian kernel

If these conditions are met, the interpolation is of second order accuracy127, that is, the approximation error is 2nd order or better. It is also suggested that a suitable kernel should have a limited or compact support radius, in order to ensure zero kernel interactions outside the computational range of the radius. We use the kernel width h as the compact support radius for all smoothing kernels, which implies W(x − ξ, h) = 0 for r > h. The first golden rule of SPH states that if a new interpretation of an SPH equation is to be found, it is always best to assume the kernel is a Gaussian128. The isotropic Gaussian kernel in n dimensions is given by

W(\mathbf{x} - \boldsymbol{\xi}, h) = \frac{1}{\left(2\pi h^2\right)^{n/2}}\, e^{-\frac{r^2}{2h^2}}, \qquad h > 0
Eq. 5.9

This kernel is depicted in Figure 5.4 (left). Even though the Gaussian kernel has very nice mathematical properties, it is not always the best kernel to use; e.g., it does not have compact support for our purpose, and it requires evaluation of the expensive exponential function. There are other choices of kernel, such as W1 = piecewise cubic spline, W2 = quadratic spline, and W3 = exponential function, which are also shown in Figure 5.4 (right), where d = (ξ − ξ_j)/h.
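The text does not spell out W1–W3 explicitly; as a hedged illustration only, the widely used Monaghan-type cubic spline (one common form of W1, an assumption here) can be coded as follows.

import numpy as np

def cubic_spline_kernel(r, h, dim=2):
    """Cubic spline kernel (a common choice for W1; assumed form), support radius 2h."""
    sigma = {1: 2.0 / (3.0 * h), 2: 10.0 / (7.0 * np.pi * h**2), 3: 1.0 / (np.pi * h**3)}[dim]
    q = np.asarray(r, dtype=float) / h
    w = np.where(q < 1.0, 1.0 - 1.5 * q**2 + 0.75 * q**3,
        np.where(q < 2.0, 0.25 * (2.0 - q)**3, 0.0))
    return sigma * w

# usage: kernel values at a few separations for h = 1 (compare with Figure 5.4)
print(cubic_spline_kernel(np.array([0.0, 0.5, 1.0, 1.5, 2.0]), h=1.0))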

127 J. J. Monaghan, "Smoothed Particle Hydrodynamics", Annual Review of Astronomy and Astrophysics, 30, pp. 543-574, 1992.
128 See above.


5.3.3 Updating of the Smoothing Length h
The smoothing length h can either be kept constant or treated as a variable.

5.3.3.1 Constant
➢ If h is too small, too few neighbors n are included and the results are not accurate.
➢ If h is too big, local information is smoothed out.

5.3.3.2 Variable
➢ The initial value h_i^0 is known at the beginning.
➢ It is then updated by solving:

\frac{D h_i^n}{Dt} = -\frac{h_i^n}{\rho_i^n N}\,\frac{D\rho_i^n}{Dt} = -\frac{h_i^n}{\rho_i^n N}\sum_{j=1}^{n} m_j \left(\frac{\mathbf{u}_j^n}{\rho_j^n} - \frac{\mathbf{u}_i^n}{\rho_i^n}\right)\cdot \nabla_i W(\boldsymbol{\xi}_i - \boldsymbol{\xi}_j)
Eq. 5.10

Figure 5.5 Ghost particles: velocities are formed symmetrically (slip wall)

where N is the number of dimensions. This approach is adequate for slowly varying density; a more elaborate procedure is needed for fast expansion or contraction (e.g. in gases)129.

5.3.4 Boundary Treatment
The issue of boundary conditions is generally very difficult in the SPH method. We address the question of properly defining boundary conditions that prevent particles from escaping the domain. Furthermore, we discuss consistency near the boundary of the domain (the near-boundary area).

Figure 5.6 Virtual particles

5.3.5 Virtual Particles
The first approach is the use of virtual particles. These particles are situated on the boundary and act through a repulsive force on the particles in the near-boundary area (near-boundary particles). Hence, virtual particles prevent an unphysical penetration through the boundary (see Figure 5.6). Unfortunately, this approach violates the condition for C1 consistency of the SPH approximation in the near-boundary area. This is due to the undesirable "cutting off" of the weight function support, see Figure 5.7 (example of a 1D task where particle j is situated in the near-boundary area). Thus, the corresponding weight function is not an

129 Remo Minero, "Mesh Free Methods for Fluid Dynamics Problems", 17 Dec. 2003.


even function130.

5.3.6 Ghost Particles
A much better way is to use ghost particles as a boundary condition. In contrast to virtual particles, this approach creates a dynamic wall that is reconstructed at each time step. Ghost particles are formed symmetrically (with respect to the boundary) to the near-boundary particles as "twin" particles, see Figure 5.5. Using ghost particles ensures C1 consistency of the SPH approximation, because the shape functions of the near-boundary particles can then be even functions.

5.3.7 Summary and Recap131
• Smoothed particle hydrodynamics is an interpolation method that can approximate continuous field quantities and their derivatives by using discrete sample points, called smoothed particles.
• Particles carry mass m, position x, and velocity u, but can also hold SPH-estimated quantities, e.g. mass density ρ, pressure p, etc. The relation V = m/ρ between volume, mass, and mass density can be used to determine the volume occupied by a particle.
• The following properties must hold for a smoothing kernel: it must be normalized, positive, and even. We only use smoothing kernels with a compact support radius h.
• The basic formulation of SPH approximates any quantity field and its derivatives.
• SPH was originally designed for compressible flow problems.

Readers are encouraged to consult [Liu & Liu]132 for detailed information and recent trends in SPH methodology.

5.3.8 Case Study 1 - Lid Driven Cavity Problem
To validate the method, the benchmark case of the lid-driven cavity is considered and the results are compared with an FDM solution on the same number of particles, for Re = 10 and 41 x 41 particles (see Figure 5.8, comparison of FDM with SPH for the lid-driven cavity).

130 P. Niedoba, L. Čermák, and M. Jícha, "Meshfree methods for computational fluid dynamics", EPJ Web of Conferences 45, 01068 (2013), DOI: 10.1051/epjconf/20134501068.
131 Micky Kelager, "Lagrangian Fluid Dynamics Using Smoothed Particle Hydrodynamics", January 9, 2006.
132 M.B. Liu, G.R. Liu, "Smoothed Particle Hydrodynamics (SPH): an Overview and Recent Developments", Arch Comput Methods Eng (2010) 17: 25–76, DOI 10.1007/s11831-010-9040-7.


5.3.9 Case Study 2 - Two-dimensional Convection–Diffusion Problem
A meshless Local Method of Approximated Particular Solutions (LMAPS) is used by [Mužík & Holičková]133 to analyze a problem described by the convection-diffusion equation. The method solves the steady convection-diffusion equation with a reaction term. The discretized system of equations is derived via an interpolation procedure and radial basis functions (RBF). The solution is performed over a simple geometry with a non-uniform velocity field, and the results are presented in the article. The LMAPS method is capable of producing stable solutions with results comparable to the analytical solutions. The LMAPS was proposed by [Chen et al.]134 and has been applied to elliptic problems and non-linear problems135. In LMAPS the domain is covered by a cloud of scattered nodes. In the work on LMAPS reported so far, the support of any computational node is taken to be a simple subdomain in the shape of a circle, with the computational node at the center of the circle, although in theory the support can be of any shape. The most often used interpolation for the field variables is moving least-squares, though some researchers have used different schemes for interpolation of the field variable and its gradients over the circular boundaries. The area of interest Ω with boundary ∂Ω is covered by points within the area and also on the global boundary (see Figure 5.9). Consider a local circular (or any simple shape, e.g. rectangular) sub-domain ΩS centered at every point s. This sub-domain is called the support domain, and using the points in a particular support domain any function can be expressed using just nodal values136.

Figure 5.9 The diagram of the global domain Ω, the local support domain Ωs of point xs, global points x and local points xi

5.4 RKPM Method

The reproducing kernel particle method belongs to the category of finite integral methods and is a modification of the SPH method. It adds a so-called correction function to the SPH formulation to ensure a certain order of consistency. The particle approximation of the function f(x) is defined as

133 Juraj Mužík and Martina Holičková, "Two-dimensional convection–diffusion problem solved using method of localized particular solutions", MATEC Web of Conferences, January 2017.
134 C.S. Chen, C.M. Fan, P.H. Wen, "Numerical Methods for Partial Differential Equations", 28, 506–522, (2012).
135 C.S. Chen, M.A. Golberg, M. Ganesh, A.H.-D. Cheng, Computers and Mathematics with Applications, 359–378, (2002).
136 See 105.


\langle f(\mathbf{x})\rangle = \sum_{j=1}^{n} \frac{m_j}{\rho_j}\, f(\boldsymbol{\xi}_j)\, C(\mathbf{x}, \boldsymbol{\xi}_j)\, W(\mathbf{x} - \boldsymbol{\xi}_j, h)
Eq. 5.11

where C(x, ξ_j) is the correction function.

5.5 Lagrangian Description of Fluid Dynamics Using SPH

Interactive fluid dynamics is of essential interest in real-time applications, such as computer games or virtual surgery simulators. Using the smoothed particle hydrodynamics (SPH) method, a stable particle-based approach can solve the motion of interactive fluids in a Lagrangian description. With focus on the simulation part, we provide a thorough insight into the mathematical theory of particle-based fluids. The basic Eulerian formulation of an incompressible, isothermal fluid for 2-D flow with constant properties is expressed as

\nabla \cdot \mathbf{u} = 0\ , \qquad \rho\left(\frac{\partial \mathbf{u}}{\partial t} + \mathbf{u}\cdot\nabla\mathbf{u}\right) = -\nabla p + \mu\,\nabla\cdot(\nabla\mathbf{u}) + \mathbf{f}
Eq. 5.12

where μ is the viscosity of the fluid and f is the sum of external force densities acting on the fluid, e.g. gravity. Using particles instead of a grid simplifies the equations significantly. We assume that the number of particles is constant during the simulation, and by keeping the mass fixed for each particle, mass conservation is guaranteed, so the continuity equation can be omitted. Figure 5.10 (Lagrangian particle-based fluid structure in 2D) depicts the basic layout of a particle-based fluid, reduced to two dimensions for clarity. The particles are represented by the dots; the circles represent the volume of each particle. In the Lagrangian formulation of a fluid the particles completely define the fluid, which implies that the particles move with the fluid. Compared to the Eulerian view this means that any field quantity now depends on time, t, only. The particles carry mass, position, and velocity, and will hold smoothed quantity approximations obtained from SPH. The acceleration of a Lagrangian fluid particle becomes the ordinary time derivative of its velocity; this is why the total derivative term (D/Dt) reduces to a simple d/dt in the Lagrangian view. The basic Lagrangian formulation of the Navier-Stokes equations for an incompressible, isothermal fluid is given by

\rho \frac{d\mathbf{u}}{dt} = \underbrace{-\nabla p + \mu \nabla^2 \mathbf{u}}_{\mathbf{f}^{\,\mathrm{internal}}} + \underbrace{\mathbf{f}}_{\mathbf{f}^{\,\mathrm{external}}}\ , \qquad
\mathbf{F} = \mathbf{f}^{\,\mathrm{internal}} + \mathbf{f}^{\,\mathrm{external}}\ , \qquad
\mathbf{a}_i = \frac{d\mathbf{u}_i}{dt} = \frac{\mathbf{F}_i}{\rho_i}
Eq. 5.13

where a_i is the particle acceleration, f^internal denotes the pressure and viscous force densities, and f^external is assigned to gravity.

5.5.1 Default Kernel


We learned about the first golden rule of SPH, and we also concluded that the isotropic Gaussian kernel is not fit for our purpose. We need a default smoothing kernel with compact support for the inter-particle SPH computations required to solve Eq. 5.13. Several suggestions for SPH kernels are discussed in137. Among them are the B-Spline and Q-Spline kernels, where the Q-Spline is concluded to be the best kernel in terms of computational accuracy. However, the Q-Spline kernel requires the evaluation of a square root, which can be expensive if the kernel is used often. Instead we will use the 6th-degree polynomial kernel suggested by138 as the default kernel, which is given by

W_{\mathrm{default}}(\mathbf{x}, h) = \frac{315}{64\pi h^9}
\begin{cases} \left(h^2 - \|\mathbf{x}\|^2\right)^3 & 0 \le \|\mathbf{x}\| \le h \\ 0 & \|\mathbf{x}\| > h \end{cases}

\nabla W_{\mathrm{default}}(\mathbf{x}, h) = -\frac{945}{32\pi h^9}\, \mathbf{x}\left(h^2 - \|\mathbf{x}\|^2\right)^2

\nabla^2 W_{\mathrm{default}}(\mathbf{x}, h) = -\frac{945}{32\pi h^9}\left(h^2 - \|\mathbf{x}\|^2\right)\left(3h^2 - 7\|\mathbf{x}\|^2\right)
Eq. 5.14

The default kernel and its derivatives in one dimension are depicted in Figure 5.11.

Figure 5.11 The default kernel and its derivatives in one dimension for h = 1

The default kernel and its derivatives are used for all smoothed quantity field approximations, except for the internal fluid force fields. For further information regarding the various smoothing kernels and their applications, please consult139.
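A minimal sketch of Eq. 5.14 follows, under the assumption (stated above) that the second and third expressions are the gradient and Laplacian of the default kernel; the numerical values in the usage line are arbitrary.

import numpy as np

def w_poly6(r_vec, h):
    """Default 6th-degree polynomial kernel of Eq. 5.14 and its derivatives (3-D).

    r_vec is the separation vector x - x_j; everything is zero outside radius h.
    """
    r2 = float(np.dot(r_vec, r_vec))
    h2 = h * h
    if r2 > h2:
        return 0.0, np.zeros(3), 0.0
    diff = h2 - r2
    w = 315.0 / (64.0 * np.pi * h**9) * diff**3
    grad = -945.0 / (32.0 * np.pi * h**9) * r_vec * diff**2
    lapl = -945.0 / (32.0 * np.pi * h**9) * diff * (3.0 * h2 - 7.0 * r2)
    return w, grad, lapl

# usage: kernel value, gradient and Laplacian at a small separation
print(w_poly6(np.array([0.01, 0.0, 0.0]), h=0.04))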

137 J. Hongbin and D. Xin, "On criterions for smoothed particle hydrodynamics kernels in stable field", Journal of Computational Physics, 202, pp. 699–709, 2005.
138 M. Müller, D. Charypar, and M. Gross, "Particle-Based Fluid Simulation for Interactive Applications", Proceedings of the 2003 ACM SIGGRAPH Symposium on Computer Animation, pp. 154-159, 2003.
139 Micky Kelager, "Lagrangian Fluid Dynamics Using Smoothed Particle Hydrodynamics", January 9, 2006.


5.5.2 Numerical Time Integration
To simulate the fluid flow, each particle is advanced through time using a global fixed time step Δt: Eq. 5.13 is employed to compute the particle acceleration, and the new particle position is obtained by integrating the acceleration numerically. In this section three different integration schemes are introduced.

5.5.2.1 The Implicit Euler Scheme
The implicit Euler scheme used here is actually a semi-implicit method, as only the position update is implicit. Semi-implicit Euler is based on the explicit Euler scheme, which is probably the most common integration method. In explicit Euler the position and velocity are updated in parallel; in semi-implicit Euler the position update is no longer independent of the velocity update:

\mathbf{x}^{t+\Delta t} = \mathbf{x}^{t} + \Delta t\, \mathbf{u}^{t+\Delta t}
Eq. 5.15

5.5.2.2 The Verlet Scheme
The velocity update is the same, but the position update uses the current and previous positions to predict the new position,

\mathbf{x}^{t+\Delta t} = 2\mathbf{x}^{t} - \mathbf{x}^{t-\Delta t} + \Delta t^2\, \mathbf{a}^{t}
Eq. 5.16

The Verlet scheme is one of the computationally fastest integrators and it is usually very stable, as the velocity is given implicitly and will not get out of sync with the position. However, collision responses are not trivial to handle, as they involve modifying positions rather than velocities.

Figure 5.12 The leap-frog mechanism

5.5.2.3 The Leap-Frog Scheme
The leap-frog integration has got its name from the fact that the velocities leap over the positions, and vice versa, as illustrated in Figure 5.12, where the horizontal line represents time t, and the

subscripts on the positions x and velocities u indicate the specific time. The integration structure is that of implicit Euler and yields:

\mathbf{u}^{t} \approx \frac{\mathbf{u}^{t-\Delta t/2} + \mathbf{u}^{t+\Delta t/2}}{2}\ , \quad\text{where}\quad
\mathbf{u}^{t+\Delta t/2} = \mathbf{u}^{t-\Delta t/2} + \Delta t\, \mathbf{a}^{t}
\quad\text{and}\quad
\mathbf{u}^{-\Delta t/2} = \mathbf{u}^{0} - \tfrac{1}{2}\Delta t\, \mathbf{a}^{0}
Eq. 5.17
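The three update rules of Eqs. 5.15–5.17 can be sketched as follows; this is only an illustrative sketch (not the author's code), with the acceleration a = F/ρ taken from Eq. 5.13 and a gravity-only force assumed in the usage example.

import numpy as np

def semi_implicit_euler(x, u, a, dt):
    """Eq. 5.15: velocity first, then the position uses the new velocity."""
    u_new = u + dt * a
    x_new = x + dt * u_new
    return x_new, u_new

def verlet(x, x_prev, a, dt):
    """Eq. 5.16: position-only update; velocity is implicit in the positions."""
    x_new = 2.0 * x - x_prev + dt**2 * a
    return x_new, x                       # new position, and the position to keep as 'previous'

def leap_frog(x, u_half, a, dt):
    """Eq. 5.17: velocities staggered by half a step relative to positions."""
    u_half_new = u_half + dt * a          # u^{t+dt/2} = u^{t-dt/2} + dt * a^t
    x_new = x + dt * u_half_new
    return x_new, u_half_new

# usage: one leap-frog step for a single particle under gravity only
rho, g, dt = 1000.0, np.array([0.0, -9.81, 0.0]), 1e-3
a = (rho * g) / rho                       # a_i = F_i / rho_i  (Eq. 5.13)
x, u_half = np.zeros(3), np.zeros(3) - 0.5 * dt * a   # u^{-dt/2} = u^0 - dt/2 * a^0
print(leap_frog(x, u_half, a, dt))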


In theory, a time integration scheme will follow Newton's 1st law, but numerical dissipation can unintentionally damp the linear motion of the particles. Typically, this is not a problem in physics-based animation, because the damping can be explained as small-scale air resistance or friction. The Verlet scheme in particular is easily influenced by numerical damping. We have chosen not to introduce any explicit damping in the time integrators, due to the different ways the integrators handle damping; we rely on the viscosity force to provide the necessary numerical damping140.

5.5.3 Collision Handling
The small-scale working domain of interactive Lagrangian fluids is limited. A practical way of creating a convincing environment for the fluid is to constrain the particle system within well-defined boundaries. Boundary containers, such as boxes, spheres, and capsules, are commonly used to constrain a fluid. When particles collide with a container they must stay inside its boundaries. Likewise, if particles collide with an obstacle, they may not penetrate or gain access to the interior of the object. Collision handling can be divided into two sub-parts: collision detection and collision response. Further discussion is avoided here, and interested readers are encouraged to read141.

5.5.4 Case Study 1 – Comparison of Weakly Compressible and Incompressible SPH
A comparative study of the Weakly Compressible (WCSPH) and Incompressible (ISPH) Smoothed Particle Hydrodynamics methods for flow over an airfoil is presented by [Shadloo, et. al.]142. WCSPH and ISPH simulation results are compared and validated with those of a finite element method (FEM). The quantitative comparisons of WCSPH, ISPH, and FEM results in terms of Strouhal number, velocity gradients on the airfoil boundaries, and the lift and drag values for the airfoil geometry indicate that the WCSPH method with the suggested implementation produces numerical results as accurate and reliable as those of the ISPH and FEM methods.

5.5.4.1 Formulation of Problem
The SPH method relies on the idea of smoothing field properties over a bounded domain as in Eq. 5.18, which is referred to as the kernel approximation of an arbitrary function f(r_i). In fact, this arbitrary function can be any hydrodynamic transport property such as temperature, enthalpy, density, viscosity, and so forth. Here, W(r_ij, h) is a kernel function, the angle brackets ⟨ ⟩ denote the kernel approximation, r_i is the position vector defining the center point of the kernel function, r_ij is the magnitude of the distance vector between the particle of interest i and its neighboring particle j, d³r_j is a differential volume element within the total bounded volume of the domain Ω, and the length h defines the support domain of the particle of interest. The SPH technique in Eq. 5.18 assumes that the fields of a given particle are affected only by those of other particles within a cutoff distance, with a smoothing radius kh, where k is a coefficient associated with the particular kernel function. A smoothing kernel function is a piecewise spline that should satisfy several conditions: the normalization, Dirac-delta function, compactness, spherical symmetry, and positive and even function properties. A thorough discussion of these attributes of the kernel function can be found in143 and the references therein.
In the SPH literature it is possible to find different forms of piecewise smoothing kernel functions possessing the above-listed properties, such as Gaussian, cubic, or quintic kernel functions. Throughout the present simulations, the compactly supported two-dimensional quintic spline kernel is used.

140 Micky Kelager, "Lagrangian Fluid Dynamics Using Smoothed Particle Hydrodynamics", January 9, 2006.
141 See above.
142 Mostafa Safdari Shadloo, Amir Zainali, Mehmet Yildiz, and Afzal Suleman, "A robust weakly compressible SPH method and its comparison with an incompressible SPH", Int. J. Numer. Meth. Engng, (2011).
143 Liu MB, Liu GR, "Smoothed Particle Hydrodynamics (SPH): an overview and recent developments", Archives of Computational Methods in Engineering, 2010.


f(\mathbf{r}_i) \simeq \langle f(\mathbf{r}_i)\rangle = \int_{\Omega} f(\mathbf{r}_j)\, W(r_{ij}, h)\, d^3 r_j

where

W(r_{ij}, h) = \frac{7}{478\pi h^2}
\begin{cases}
(3 - s_{ij})^5 - 6(2 - s_{ij})^5 + 15(1 - s_{ij})^5 & \text{if } 0 \le s_{ij} < 1 \\
(3 - s_{ij})^5 - 6(2 - s_{ij})^5 & \text{if } 1 \le s_{ij} < 2 \\
(3 - s_{ij})^5 & \text{if } 2 \le s_{ij} < 3 \\
0 & \text{if } s_{ij} \ge 3
\end{cases}
\quad\text{and}\quad s_{ij} = r_{ij}/h
Eq. 5.18

5.5.4.2 Results
Figure 5.13 compares the velocity contours of ISPH (upper), FEM (middle), and WCSPH (lower) for angles of attack of 5 and 15 degrees (contours show the velocity magnitude, m/s) at Re = 570. Similar to the previous benchmark problem, both WCSPH and ISPH results are in good agreement with those of the mesh-dependent FEM technique. In all simulations, the results of WCSPH are as accurate as the ISPH ones. The figures further illustrate that the proposed algorithm is also very successful in simulating the flow around the airfoil geometry with different angles of attack across the flow field144.

Figure 5.13 Comparison of ISPH (upper), FEM (middle), and WCSPH (lower) velocity contours for the angle of attack of 15 degrees at Re = 570 (Courtesy of Shadloo105)

5.5.5 Case Study 2 - Dam Break Water Flow using Lagrangian Description
The analysis of fluid flow is more an area of interest for physicists than computer scientists. However, in order to be convinced that the Lagrangian fluid method can produce realistic fluid motion, we will examine the fluid flow. We study the velocity fields produced by the dam-break problem for water. In a classic dam-break problem the fluid is constrained inside a dam, and when the fluid is at rest the dam is broken, or the barricade that constrains the fluid is removed. The fluid then flows freely and often collides with a vertical wall. Frames from the dam-break of water simulated by particles are depicted in Figure 5.14.

144 Mostafa Safdari Shadloo, Amir Zainali, Mehmet Yildiz, and Afzal Suleman, "A robust weakly compressible SPH method and its comparison with an incompressible SPH", Int. J. Numer. Meth. Engng, (2011).


The flow of water is shown with a simulation time interval of 0.1 s between frames, from left to right, top to bottom. This is just a survey of how the visible water particles flow in the dam-break problem; the frames show the dam-break of water simulated by 2250 particles.

Figure 5.14 Dam-break flow of water

5.5.6 Case Study 3 - Dam Break using MLPG-RBF and Shallow Water Equations
The application of the meshless local Petrov-Galerkin (MLPG) method to solve the shallow water equations (SWE) is investigated by [Mužík and Holičková]145. The shallow water equations (also called the de Saint-Venant equations) are used to describe flow behavior in bodies of water where the horizontal length scales are much greater than the flow depth; therefore, the 3D problem can be treated as 2D. This localized approach is based on the meshless weak formulation with the use of radial basis functions (RBF) as the trial functions. In this work, the numerical model is applied

Figure 5.15 Geometry and water surface profile of the 2D dam-break problem at t = 7.2 s

145 Juraj Mužík, Martina Holičková, "Meshless simulation of dam break using MLPG-RBF and shallow water equations", MATEC Web of Conferences 117, 00127 (2017).


to simulate a dam-break problem as one of the most descriptive benchmark problems for the SWE. As a result, the adopted meshless method not only shows its applicability for the class of problems described by the SWE, but also brings more efficiency than several conventional mesh-based methods. The problem models a partial dam-break with a rapid opening of a sluice gate through a non-symmetric breach, testing the ability to simulate discontinuous flows. The computational domain is a 200 m by 200 m region. A dam is located in the middle of the domain with 10 m thickness. The initial water depth is 10 m on one side and 5 m on the other side of the dividing wall. At time t = 0, the dam fails, and the water is released through the 75 m wide non-symmetric breach, as shown in Figure 5.15 (left). When the downstream water depth is 5 m, the flow is subcritical everywhere. The boundary conditions at x = 0 and x = 200 m are assumed to be transmissive and all other boundaries are considered reflective. At the instant of the dam break, water is released through the breach, forming a positive wave propagating downstream and a negative wave spreading upstream. The results at t = 7.2 s (Figure 5.15, right), when the waves have not yet reached all the boundaries, are compared with a least-squares finite element method (LSFEM). The left-moving positive wave and the right-moving negative wave are both well resolved. The scheme proved stable and able to capture the fine details of the flow, and its behavior is in satisfactory agreement with the computed results of these studies.

5.5.7 Case Study 4 - SPH Method for Evaporating Multiphase Flows
Because evaporation is encountered in many engineering applications, such as fuel droplets in engines, liquid sprays, and material processing, a numerical method to accurately predict liquid evaporation is of great importance. Common engineering models for predicting droplet evaporation assume that the liquid droplet is a point source with homogeneous properties. The primary concern of these models is the mass transfer rate, without consideration of the gradients in the droplet or at the liquid-gas interface. While such models are useful in engineering applications, advanced numerical methods are needed to reveal the details of the evaporation process. The dynamics of evaporating flows involves phase change and energy transfer at the liquid-gas interface, diffusion of vapor species in the gas phase, and multiphase flows with sharp interfaces. Because of the complexity of the evaporation problem, detailed numerical simulation is challenging. The main numerical challenges in simulating evaporating flows include the treatment of phase change and the sharp discontinuity of fluid properties at the liquid-gas interface. Phase change due to evaporation causes mass transfer from one phase to the other, and the discontinuity at the liquid-gas interface, in variables such as density, also leads to numerical difficulties. The intent of the work of [Xiufeng Yang & Song-Charng Kong]146 is to provide a numerical method, based on smoothed particle hydrodynamics (SPH), to simulate multiphase flows with evaporation. The SPH method is a Lagrangian mesh-free particle method in which a continuous fluid is discretized using SPH particles that carry physical properties such as mass, density, pressure, viscosity, and velocity. Since SPH is a mesh-free method, a smoothing kernel is introduced to connect the neighboring particles.
The variables and their spatial derivatives are discretized as summations over particles. In the SPH method developed for this study, the SPH particles near the interface are allowed to change their mass to model the process of evaporation at the interface. The rate of mass change of SPH particles due to evaporation depends on the vapor mass fraction in the gas phase and the saturated vapor mass fraction at the interface. The saturated vapor mass fraction can be predicted by the Clausius-Clapeyron relation. During the process of evaporation, the mass of a liquid SPH particle at the interface increases, while the mass of a gas SPH particle decreases. To constrain the mass of individual SPH particles, a particle will split into smaller particles if its mass becomes large enough, or merge into a neighboring particle if its mass becomes small enough.

146 Xiufeng Yang and Song-Charng Kong, "Smoothed Particle Hydrodynamics Method for Evaporating Multiphase Flows", Physical Review E 96, 033309 (2017).


5.5.7.1 Basic Formulations of the SPH Method
In SPH, the value of a function f(r) at a point r_a can be approximated using the following integration:

f(\mathbf{r}_a) \approx \int f(\mathbf{r})\, W(\mathbf{r}_a - \mathbf{r}, h)\, dV
Eq. 5.19

where W is a kernel function and dV is a differential volume element. The parameter h is referred to as the smoothing length, which determines the size of the integration domain. In this paper, the following hyperbolic-shaped kernel function in two-dimensional space is used:

W(s, h) = \frac{1}{3\pi h^2}
\begin{cases}
s^3 - 6s + 6 & 0 \le s < 1 \\
(2 - s)^3 & 1 \le s < 2 \\
0 & s \ge 2
\end{cases}
Eq. 5.20
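A minimal sketch of the hyperbolic-shaped kernel of Eq. 5.20 follows; note that the 1/(3πh²) normalization is the reconstruction adopted above and should be treated as an assumption.

import numpy as np

def hyperbolic_kernel_2d(r, h):
    """Hyperbolic-shaped 2-D kernel of Eq. 5.20 (support radius 2h); normalization assumed."""
    s = np.asarray(r, dtype=float) / h
    w = np.where(s < 1.0, s**3 - 6.0 * s + 6.0,
        np.where(s < 2.0, (2.0 - s)**3, 0.0))
    return w / (3.0 * np.pi * h**2)

# usage: kernel values at a few separations for h = 1
print(hyperbolic_kernel_2d(np.array([0.0, 0.5, 1.0, 1.9, 2.5]), h=1.0))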

In the SPH method, a continuous fluid is discretized into particles carrying properties such as mass m, density ρ, velocity u, and viscosity μ. The integration of Eq. 5.19 is then discretized as a particle summation as follows:

f(\mathbf{r}_a) \approx \sum_{b} \frac{m_b}{\rho_b}\, f(\mathbf{r}_b)\, W(\mathbf{r}_a - \mathbf{r}_b, h)
Eq. 5.21

5.5.7.2 Evaporation of a Static Drop
The evaporation of a static drop was simulated using the proposed SPH method. The initial radius of the drop is R0 = 0.15 mm and its initial temperature is 353 K. The drop was located at the center of a square computational domain, which was filled with gas. The side length of the square was 1.2 mm. The initial temperature of the gas was 373 K. The temperature of the boundary was also 373 K and did not change during the simulation. These temperatures were chosen in order to be consistent with, and to allow comparisons with, the conditions in the literature. The initial vapor mass fraction in the gas phase was zero, and the vapor mass fraction at the boundary remained zero. The initial particle spacing was 0.02 mm. Figure 5.16 shows that the size of the drop decreased slightly; the decrease in drop size was compared with the result from a two-dimensional (2D) axisymmetric level-set method. It should be noted that the 2D circle used in this study corresponds to the cross section of a three-dimensional (3D) cylinder of infinite length, while the 2D axisymmetric circle corresponds to a 3D sphere.

Figure 5.16 Snapshots of the evaporating drop at different times using SPH

5.5.7.3 Evaporation of a Dynamic Drop Impacting on a Hot Surface
The proposed method was also used to simulate the evaporation of a drop impacting a hot surface. The initial radius of the drop was R = 0.25 mm and the initial velocity of the drop was U = 2 m/s. The height and length of the computational domain were 1.5 and 5.0 mm, respectively. The drop was


located at the center of the domain and was surrounded by gas. The initial temperature of the drop was 353 K and the initial temperature of the gas was 373 K. The temperature of the boundaries was also 373 K and did not change during the simulation. The initial vapor mass fraction in the gas phase was zero, and the vapor mass fraction at the boundary remained zero. The initial particle spacing was 0.02 mm. Figure 5.17 shows the evolution of the drop impact on the hot surface. After the drop touched the surface, it spread and formed a film on the surface. At approximately 1.0 ms, a tiny crown-like structure formed around the rim. Later, the crown merged with the film, and the film receded. Finally, the film reached an equilibrium size. Since the initial temperature of the drop was lower than the gas temperature, heat transfer from the surrounding gas to the drop led to a decrease in the local gas temperature. However, the drop temperature also decreased slightly because evaporation consumed energy, as discussed earlier. When the drop spread on the hot surface and formed a film, heat transfer from the hot surface to the film increased the temperature of the film. The intent of this paper was to present an SPH method to simulate evaporating multiphase flows. This method accurately models the process of evaporation at the liquid-gas interface and the diffusion of the vapor species in the gas phase. An evaporating mass rate was derived to calculate the mass transfer at the interface. To model the process of phase change from the liquid phase to the gas phase, mass was allowed to transfer from a liquid SPH particle to a gas SPH particle. Thus the proposed method, unlike the traditional SPH method, allows a change in the mass of an SPH particle. Additionally, particle splitting and merging techniques were developed to avoid large differences in SPH particle mass.

Figure 5.17 Evolution of dynamic drop impact on a hot surface using SPH

5.5.7.4 Concluding Remarks
In general, the results show that the method proposed in this paper successfully replicated the physical processes of evaporating flows, such as heat and mass transfer and the diffusion of the vapor species. One example was the simulation of the evaporation of a static drop: because of evaporation, the present SPH method predicts decreases in both the temperature of the interface and the size of the drop. The last example was the simulation of the evaporation of a drop impacting a hot surface. The temperature of the liquid-gas interface decreased at first because of evaporation, especially at the rim of the film; the temperature then increased because of the heat transfer from the hot surface to the liquid. In summary, the results of this study indicate that the numerical method proposed in this paper can be successfully used to produce an evaporating flow simulation. Additional information


can be obtained from [Yang & Kong]147.

147 Xiufeng Yang and Song-Charng Kong, "Smoothed Particle Hydrodynamics Method for Evaporating Multiphase Flows", Physical Review E 96, 033309 (2017).


6 CFD Applications in Other Areas
Recently CFD has found very wide application in different areas of science and engineering; some examples are148:

➢ Aerodynamics of aircraft and vehicles - lift and drag
➢ Hydrodynamics of ships
➢ Power plant - combustion in Internal Combustion Engines (ICE) and gas turbines
➢ Turbomachinery - flows inside rotating passages, diffusers, etc.
➢ Electrical and electronics engineering - cooling of equipment including microcircuits
➢ Chemical process engineering - mixing and separation, and polymer molding
➢ Marine engineering - loads on off-shore structures
➢ Environmental engineering - distribution of pollutants and effluents
➢ Hydrology and oceanography - flows in rivers, estuaries and oceans
➢ Meteorology - weather prediction
➢ Biomedical engineering - blood flow through arteries and veins
➢ Food processing
➢ External and internal environment of buildings: wind loading, ventilation analysis and heating/cooling load calculations

6.1 Food Processing
CFD applications in the food industry may assist in a better understanding of the complex physical mechanisms involved. [Scott]149 and [Quarini]150 have reviewed the general application of CFD to the food processing industry. Moreover, other literature is also available on specific CFD application areas such as clean-room design, refrigerated transport, static mixers, and pipe flow. Since the CFD technique can be of great benefit to the food processing industry, fast development has taken place in the past few years. CFD, as a research tool for enhancing the design process and the understanding of the basic physical nature of fluid dynamics, can provide benefits to the food processing industry in many areas, such as drying, sterilization, mixing, refrigeration, crystallization, pasteurization and other application areas151.

6.1.1 Drying
Drying is a common food manufacturing process. The drying rate is a strong function of air flow or air velocity. Therefore, it is of great importance to know the air flow and velocity in the drying chamber, in order to identify the areas of adequate air velocity for proper drying. However, air flow and air velocity are difficult to measure during operation, because several sensors would need to be placed at various air flow directions and locations. Since there are difficulties in modelling the complex phenomena, especially the gas turbulence, CFD is a powerful tool to aid the prediction of the drying process. CFD has been used to predict the air flow and velocity during drying. Drying tests of several fruits were performed, and the results showed that the degree of fruit dryness depended on its position within the drier. Determination of pressure profiles and air velocities by CFD showed that

148 Versteeg, H., "An Introduction to Computational Fluid Dynamics", Pearson Publications, ISBN 978-81-3172048-6, (2009).
149 Scott GM (1977), "Simulation of the flow of non-Newtonian foods using computational fluid dynamics", Campden & Chorleywood Food Research Association R & D Report No. 34, UK.
150 Quarini J (1995), "Applications of Computational fluid dynamics in food and beverage production", Food Sci Technol Today 9: 234-237.
151 Bin Xia, Da-Wen Sun, "Applications of computational fluid dynamics (CFD) in the food industry: a review", Computers and Electronics in Agriculture 34 (2002) 5–24.


the main cause of the variations in drying rates and moisture contents was the lack of spatial homogeneity of air velocities within the drier. With the aid of CFD, researchers studied velocity fields in a modern sausage drier in order to provide information on air circulation inside the drier, which showed that CFD was able to predict the effects of filling level on air-flow patterns and also to identify measurement errors in areas where the main air flow direction was horizontal152. However, the quantitative comparison between the simulated and measured air velocities showed wide discrepancy, with mean absolute differences of about 0.6 m/s. Although the flow pattern and air velocity in the drier can be predicted using CFD modelling, further study on how to control the drying process and reduce the energy cost is still a research topic for CFD modelling. Meanwhile, more attention should be paid to assumptions such as spatial homogeneity, because such assumptions can lead to inaccuracy in the prediction.

CFD has also been used to investigate the performance and design of spray dryers in the food industry. Spray dryers are used to produce products such as milk and coffee powder, as well as detergents. However, the design of spray dryers for the food industry is difficult because their performance is heavily influenced by the complexity of the air and spray flow patterns inside the dryers. Therefore, there is considerable scope for the application of CFD simulation, including optimum design of spray dryers and solutions for operational problems such as wall deposition. In the past several years, studies such as modelling and measuring the air flow pattern in a co-current pilot plant spray dryer (Kieviet et al., 1997) and analyzing the effects of air inlet geometry and spray cone angle on the wall deposition rate in spray dryers have been performed. All these studies show that there appears to be a large scope for using CFD for other purposes. For example, CFD can be used to simulate the air flow in a spray dryer in two dimensions and to calculate the trajectories and the course of the drying process of the atomized particles. Straatsma153 developed a drying model utilizing a turbulence model to calculate the gas flow field and showed that the drying model was an effective tool for indicating how to adapt the modelling of industrial dryers to obtain better product quality or to optimize the drying performance of the unit. However, as the applications and specifications of dryers become more and more complex, so does the need for improved test work in pilot plants, and CFD simulations become more important in providing quick and valuable information.

6.1.2 Sterilization
It is known that consumer demands for food products focus on safety, product quality and cost154. Therefore, it is of great necessity to enhance quality and assure safety of the food supply. Sterilization is an important technique for food storage and preservation. CFD can be used to study both the temperature distribution and the flow pattern of food during sterilization, so as to optimize the quality of food products. Thermal processing remains the most significant sterilization technique; it results in microbial inactivation, but at the same time in quality loss and flavor development. Excessive heating will affect food quality and its nutritive properties. With the application of CFD, there have been a number of studies to optimize the thermal sterilization of foods.
These studies have led to substantial improvement in the optimal control of the process and the retention of the nutritional and sensory quality of the food. Other researchers carried out a series of studies on canned food sterilization with CFD simulation. The work varied from simulating the changes in bacteria diffusion and their transient spatial distribution during the sterilization process, to simulating natural convection heating within a can of liquid food during sterilization. It is only in


recent years that food pouches have been introduced to the market and, therefore, little or no study has been executed on the sterilization of food in pouches. A CFD code was used to simulate the transient temperature, the velocity profiles and the shape of the slowest heating zone in the sterilization of carrot soup in pouches. The modelling of a continuous sterilization process to optimize the quality of safe food has also been developed, and the results showed that CFD modelling could be of significant help to liquid food sterilization. However, all of these investigations of CFD application in sterilization concern thermal sterilization in the limited area of liquid foods. There remain many challenges in the area of sterilization with the application of CFD, for instance ultraviolet, visible and infrared light surface sterilization, plasma/corona sterilization, electron and X-ray sterilization, nascent oxygen/ozone sterilization of fruits and vegetables, and pressure sterilization of fresh fruit juices and cooked ham. The application of CFD in these sterilization fields of food is still to be developed in the future. Moreover, assumptions are normally made to simplify CFD modelling. For example, specific heat, thermal conductivity and volume expansion coefficient were assumed to be constants in the study by Abdul Ghani et al. (1999a), although all these parameters are temperature dependent. More studies should be carried out to minimize these assumptions and thus to improve the accuracy of CFD prediction. Another area for the application of CFD is the real-time control of sterilization: effective real-time monitoring of sterilization will improve the quality and safety of foods. Above all, the ultimate objective is to optimize the sterilization process of the food and to obtain food with excellent quality and safety. With the aid of CFD, the sterilization process can be improved.

6.1.3 Mixing
In the food processing industry, mixing is one of the most common operations. Mixing applications involve gas, liquid and solid substances, and the mixing of fluids is one of the most important unit operations for the food processing industry. However, mixing is a complicated process, because of the multiphase turbulence during mixing and the design of the mixer. CFD is a powerful tool for the modelling of mixing processes; it provides a natural method to link food process and fluid flow information. During mixing, a common method of enhancing the process is to use some kind of stirrer or paddle. CFD codes have been applied in optimizing the mixing process to minimize energy input and to shorten the processing time. Therefore, research has been carried out on the distribution of energy in the mixing vessel and on the effects on mixing quality when the stirrer is in different positions. Such prediction of the mixing process within these units was impossible in the past. Recently, CFD modelling of mixing in stirred tanks has been carried out by [Sahu]155, with several important points about impeller-vessel geometry, energy balance and the linkage between the flow field and the design objective being addressed. Although no experiments were carried out in that study, the predicted values of mixing time were compared with published experimental data and the agreement was within 5–10%. This study will benefit the design of stirred tanks, and some technical problems concerning impeller types, mixing time and equipment size can be avoided.
The design of mixing devices is an important topic in analyzing the mixing process. Therefore, research work focusing on the application of CFD to the design of mixing devices, for instance shallow bubble columns, has been carried out. The results of these studies provide benefits including easy measurement of the drop size distribution, the velocities of the phases and the degree of mixing, and accurate description of the turbulence, swirling and vortices generated in the mixer. Thus, the development of CFD applications to mixing in the food processing industry will lead to more accurate monitoring, control and optimization of the mixing process. In the meantime, it forms a good basis for mixing process improvement.

155 Sahu, A.K., Kumar, P., Patwardhan, A.W., Joshi, J.B., "CFD modelling and mixing in stirred tanks", Chemical Engineering Science 54 (13–14), 2285–2293, 1999.


6.1.4 Refrigeration
The consumption of frozen foods has increased continually in the past years because frozen foods have demonstrated a good food quality and safety record. Refrigeration can slow down bacterial growth and preserve food. Therefore, researchers have recently applied CFD to the modelling of heat and mass transfer in foods during refrigeration (chilling and freezing). They have developed the modelling of air blast and vacuum cooling, chilling, the cold chain, cold stores, refrigerated rooms and refrigerated display cabinets. CFD simulation of heat and moisture transfer for predicting the cooling rate and weight loss of cooked ham during the air blast chilling process has been investigated. Both experimental and predicted results showed that the core temperature of the cooked ham was cooled down from 74.4 to 4°C within approximately 530 min. The experimental cumulative weight loss was 4.25%, while the simulated results were 4.07 and 4.22%, respectively, obtained from the standard k–ε model and the LRN k–ε model. At the same time the effect of fluctuation in inlet airflow temperature was studied, indicating that setting the boundary condition of airflow temperature is an important factor affecting the prediction accuracy: if a constant temperature was assumed for the inlet air, the weight loss (4.37%) was over-predicted. Furthermore, the effects of different k–ε models and thermocouple positions on the prediction accuracy of CFD modelling of the air-blast chilling process were also analyzed. Some researchers developed a two-dimensional simulation model for the airflow in two industrial meat chillers. Recently, the temperature increase in frozen food packaged in pallets in the distribution chain was investigated by means of CFD modelling. Good agreement was found between the experimental and modelling results, with the differences normally within 10%. The study showed that a controlled temperature throughout the cold chain is necessary to ensure high food quality with long storage duration. Although the modelling of air flow and temperature distribution has been well developed, models for phase transition, such as condensation and evaporation, are not yet available.

6.1.5 Crystallization
Crystallization is one of the oldest unit operations in the chemical and food industry, but the design and operation of crystallization processes still pose many problems. Until recently, there have been few tools capable of providing the required capabilities. This is because modelling of crystallization processes poses a number of challenges. The key challenge is representing the inherent physical and chemical complexity of crystallization phenomena mathematically and validating the resulting mathematical model against experimental data. CFD helps in the modelling of the crystallization process and the design of the crystallizer156.

6.1.6 Pasteurization
Pasteurization is a vital unit operation which is used to inactivate the spoilage organisms and enzymes present in milk. Similarly, CFD analysis has been applied to the thermal pasteurization of intact eggs. Calculated temperature profiles were found to be in good agreement with experimentally observed data for eggs of different sizes. A generally accepted kinetic inactivation model for Salmonella enteritidis was incorporated in the CFD analysis and provided a basis for process assessment. Minimum process times and temperatures to provide equivalent pasteurization effectiveness at 5 log reductions of the target microorganism were obtained on a theoretical basis.
Combining a CFD analysis with inactivation kinetics proved to be a very useful approach for establishing process conditions leading to consumer-safe eggs. In-package pasteurization for beer microbiological stabilization has also been conducted. A heating process was simulated at 60°C up to 15 PUs (a conventional beer process, in which 1 Pasteurization Unit (PU) is equivalent to 1 minute at 60°C). The temperature profile and convection current velocity along the process and the variation of the PUs were evaluated

156 Kaushal and Sharma, "Concept of Computational Fluid Dynamics (CFD) and its Applications in Food Processing Equipment Design", J Food Process Technol 2012, 3:1.


in relation to time considering the cans in the conventional, inverted, and horizontal positions. The package position did not result in process improvement.

6.2 CFD in Semiconductor Industry As with most of the technologies, one can know its applications only when it is invented157. Today CFD is being used to help in designing in every area where Fluid is involved. CFD has found its application with semiconductor industry as well. CFD solution can help immensely in reducing the number of experiments required to design various chip manufacturing equipment’s. After validation with experiments, one can find finer details more easily from CFD than with experiment e.g. temperature distribution over the surface, deposition rate, rate of desorption. Various semiconductor industries have started using CFD calculation to help their design engineers. But it still has a long way to go and gain confidence from everybody to its results. CFD (Computational Fluid Dynamics) could be used to model the thermal system at a board level as well as within a semiconductor chip, so that efficient heat-dissipation mechanisms and sufficient cooling systems could be designed around these systems. CFD could hold interesting possibilities given that we are now looking at three-dimensional (3D) transistor dies as well as multi-die two-dimensional (2D) packages. Heat dissipation is critical for the long-term reliability of semiconductor devices 6.2.1 Brief Description of Semiconductor Devices Semiconductor devices are electronic components that exploit the electronic properties of semiconductor materials, principally silicon, germanium, and gallium arsenide, as well as organic semiconductors. Semiconductor devices have replaced thermionic devices (vacuum tubes) in most applications. They use electronic conduction in the solid state as opposed to the gaseous state or thermionic emission in a high vacuum. In layman terms, semiconductor is the category of conductors which besides being a conductor of current is also an insulator. As evident from the diagram (see Figure 6.1), the energy band gap between valence band and conduction band is: • • •

• large in the case of an insulator
• overlapping in the case of a metal
• moderate in the case of a semiconductor

Figure 6.1 Illustrates the various classes of conductors

157 CFD Online.

What are they, you ask?
1. Valence Band - the band in which electrons reside.
2. Conduction Band - the band to which electrons jump and conduct electricity.
3. Energy Band Gap - the forbidden band in between, which does nothing!
For an electron to jump to the conduction band, it requires energy. Such an amount of energy is almost impossible to provide in insulators but easily achievable in metals. In semiconductors the energy can be tuned so as to make the material behave like a metal or an insulator. The energy provided must be greater than the band-gap energy (>1 eV). This feature enables semiconductors to be used as switches: to switch a circuit ON, you just need to provide energy greater than 1 eV to the semiconductor device. Figure 6.2 shows a modern semiconductor device.

Figure 6.2 Modern Semiconductor

6.2.2 Thermal Management in Semiconductors
There are at least ten good reasons to include thermal measurements as a routine step in any electronic component or system design process158. Amid all the promotion of solid-state superlatives ranging from data rate to feature size to LED light output, one characteristic is never touted: junction temperature. That is because junction temperature (JT) is an undesired but unavoidable side effect of high currents and/or switching speeds. A p-n junction, whether it is one of millions on a CPU chip or the only one within a power LED, generates heat. In the past two decades the semiconductor industry has seen heat dissipation increase by orders of magnitude. Faster is better, but faster is also hotter. This trend is not without consequences. A 10°C increase in JT can cause a 50% reduction in a semiconductor device's life expectancy. In LEDs, both brightness and color can suffer as JT increases. And of course the twin issues of safety and cooling can impact the design of an entire system, not just the semiconductor device producing the heat. All these facts point toward the need for a thorough grasp of thermal behavior at the chip level and beyond. True understanding comes with physical measurements performed on actual devices. This is especially true in the world of semiconductors: heat dissipation in semiconductor packages is one of the limiting factors in miniaturization. One of the biggest concerns of circuit designers is power dissipation, which is continuously increasing with bandwidth; as a result, the chip temperature increases. This change first modifies and then destroys the operation of the circuit if the heat is not correctly led out of the device.

Figure 6.3 Thermal Management of Semiconductor (courtesy of Mentor CFD)

Being able to understand the true thermal

158 From Mentor CFD Blogs.

characteristics of a chip that will go inside an enclosure jam-packed with other heat-generating equipment can be very helpful. While most manufacturers publish thermal metrics for their chips, unfortunately not every manufacturer knows how to conduct an appropriate thermal characterization of their devices, so you cannot always rely on published metrics159.
6.2.3 Can You Really Fry an Egg on a CPU?
An interesting question arises: can you really fry an egg on a CPU? Believe it or not, somebody has already tried it160. Solving complex thermal models with computational fluid dynamics (CFD) requires a lot of processing power, and a central processing unit (CPU) under full load generates a fair amount of heat. But can you cook an egg on it? This article describes the model, the simulations, and the ultimate conclusion. Before you throw away your conventional heatsink and fan in favor of a multifunctional omelet, we can use CFD to predict the fate of your PC if you do so (see Figure 6.4). Unfortunately, the CPU junction temperature exceeds 90°C within 6 seconds, at which point the CPU clock would throttle down to reduce the thermal power and prevent damage to the system; less than ideal for a cooling solution. The egg would also burn and catch fire. The central location of the CPU on the board and the large obstacles to air flow in the neighboring memory DIMMs and I/O ports mean limited cold air can passively flow over the hot egg by natural convection. The passive cooling of the egg cannot match the forced convection of the stock cooler. An egg-based cooling solution would only keep the CPU below the maximum 90°C if the CPU performance were throttled down, so there are only possible applications in lower-power environments with plenty of ventilation. With the requirement of frequently swapping out the egg, we cannot see this catching on. If the aim is to cook eggs, though, CPUs certainly produce enough heat to do so; with thermal throttling, the processor acts as a thermostatically controlled surface at around 90°C, sufficient to cook on. If you value your computer, maybe consider buying a frying pan instead.

Figure 6.4 An Example of an Egg Frying on a CPU
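To put the "90°C within 6 seconds" result in perspective, the sketch below integrates a simple lumped-capacitance model of a poorly cooled CPU package. The power, thermal resistance and heat capacity are assumed round numbers chosen for illustration, not values from the simulation described above.

```python
# A lumped-capacitance sketch of the CPU heating transient described above:
# dT/dt = (P - (T - T_amb)/R) / C. All parameter values are assumptions.
P = 95.0        # W, assumed CPU package power
R = 1.5         # K/W, assumed junction-to-ambient resistance (poor cooling)
C = 8.0         # J/K, assumed effective thermal capacitance
T_amb, T_max = 25.0, 90.0

dt, T, t = 0.01, T_amb, 0.0
while T < T_max and t < 60.0:
    T += dt * (P - (T - T_amb) / R) / C   # explicit Euler step
    t += dt
print(f"Junction reaches {T_max:.0f} C after ~{t:.1f} s "
      f"(steady state would be {T_amb + P * R:.0f} C)")
```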

6.3 Magneto-Hydro-Dynamics (MHD)
Magneto-hydro-dynamics (MHD; also magneto-fluid dynamics or hydro-magnetics) is the study of the magnetic properties of electrically conducting fluids. Examples of such magneto-fluids include

159 From Mentor CFD Blogs.
160 James Forsyth, System-Level Design, Semiconductor Engineering.

plasmas, liquid metals, salt water and electrolytes. The word "magneto-hydro-dynamics" is derived from magneto-, meaning magnetic field, hydro-, meaning water, and -dynamics, meaning movement161. In a nutshell, MHD is the study of electrically conducting fluids, combining the principles of fluid dynamics and electromagnetism. According to [Battista]162, the subject of MHD is traditionally studied as a continuum theory; that is to say, attempts at studying discrete particles in the flows are not at a level at which such computations are realistic. To run "realistic simulations" would require computing flows with many more particles than current computers are able to handle. Thus, the only way to study MHD seems to be in its continuum form, leading to its description using the Navier-Stokes fluid equations163.
6.3.1 MHD Equations
The ideal MHD equations consist of the continuity equation, the Cauchy momentum equation, Ampere's law neglecting displacement current, and a temperature evolution equation. As with any fluid description of a kinetic system, a closure approximation must be applied to the highest moment of the particle distribution equation. This is often accomplished with approximations to the heat flux through a condition of adiabaticity or isothermality. The main quantities that characterize the electrically conducting fluid are the bulk plasma velocity field v, the current density J, the mass density ρ, and the plasma pressure p. The flowing electric charge in the plasma is the source of a magnetic field B and an electric field E. All quantities generally vary with time t as described by Eq. 6.1:

I. The two continuity equations for charge conservation, where ρc = 0 because we assume the absence of an external charge distribution.
II. The Cauchy momentum equation, in which the Lorentz force term J×B can be expanded using Ampere's law and a vector calculus identity; the first term on the right-hand side is the magnetic tension force and the second term is the magnetic pressure force.
III. The ideal Ohm's law for a plasma.
IV. Faraday's law.
V. The low-frequency Ampere's law, which neglects displacement current.
VI. The magnetic divergence constraint.
VII. The energy equation, where γ = 5/3 is the ratio of specific heats for an adiabatic equation of state. This energy equation is only applicable in the absence of shocks or heat conduction, as it assumes that the entropy of a fluid element does not change.
VIII. The Hartmann number (Ha), the ratio of electromagnetic force to viscous force, first introduced by Hartmann, where B is the magnetic field, L is the characteristic length scale, σ is the electrical conductivity, and μ is the dynamic viscosity.

161 From Wikipedia, the free encyclopedia.
162 Nicholas A. Battista, "An Introduction to Magnetohydrodynamics", Stony Brook University, December 2010.
163 See previous.


$$\text{(I)}\quad \frac{\partial \rho_c}{\partial t} = \nabla\cdot\mathbf{J} = 0
\qquad
\text{(II)}\quad \rho\left(\frac{\partial \mathbf{v}}{\partial t} + \mathbf{v}\cdot\nabla\mathbf{v}\right) = \underbrace{\mathbf{J}\times\mathbf{B}}_{\text{Lorentz force}} - \nabla p
\ ,\quad
\mathbf{J}\times\mathbf{B} = \frac{(\mathbf{B}\cdot\nabla)\mathbf{B}}{\mu_0} - \nabla\!\left(\frac{B^2}{2\mu_0}\right)$$

$$\text{(III)}\quad \mathbf{E} + \mathbf{v}\times\mathbf{B} = 0
\qquad
\text{(IV)}\quad \frac{\partial \mathbf{B}}{\partial t} = -\nabla\times\mathbf{E}
\qquad
\text{(V)}\quad \mu_0\,\mathbf{J} = \nabla\times\mathbf{B}$$

$$\text{(VI)}\quad \nabla\cdot\mathbf{B} = 0
\qquad
\text{(VII)}\quad \frac{d}{dt}\!\left(\frac{p}{\rho^{\gamma}}\right) = 0
\qquad
\text{(VIII)}\quad \mathrm{Ha} = B\,L\sqrt{\frac{\sigma}{\mu}}$$

Eq. 6.1
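As a quick numerical illustration of definition (VIII) in Eq. 6.1, the snippet below evaluates the Hartmann number for an assumed liquid-metal duct flow; all property values are illustrative assumptions rather than data from a specific experiment.

```python
# Quick evaluation of the Hartmann number of Eq. 6.1 (VIII): Ha = B L sqrt(sigma/mu).
import math

def hartmann(B, L, sigma, mu):
    """Ratio of electromagnetic to viscous force, Ha = B*L*sqrt(sigma/mu)."""
    return B * L * math.sqrt(sigma / mu)

B     = 1.0        # T, imposed magnetic field (assumed)
L     = 0.05       # m, characteristic length scale (assumed duct half-width)
sigma = 3.5e6      # S/m, electrical conductivity (assumed, liquid-metal order)
mu    = 1.5e-3     # Pa*s, dynamic viscosity (assumed)
print(f"Ha = {hartmann(B, L, sigma, mu):.0f}")
```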

The fundamental concept behind MHD is that magnetic fields can induce currents in a moving conductive fluid, which in turn polarize the fluid and reciprocally change the magnetic field itself. The set of equations that describe MHD are a combination of the Navier-Stokes equations of fluid dynamics and Maxwell's equations of electromagnetism (see Eq. 6.1). These differential equations must be solved simultaneously, either analytically or numerically. Figure 6.5 shows the right-hand rule (RHR) for forces in MHD.

Figure 6.5 Right Hand Rule for MHD

6.3.2 Case Study - Dynamics of a Q2D Wake Behind a Cylinder in Presence of MHD Environment
A confined laminar viscous flow past a two-dimensional bluff body in the presence of a strong uniform magnetic field is investigated by [Hamid et al.]164. The effects of Reynolds number (Re) and Hartmann number (Ha) on the dynamics of the wake are examined, with a focus on the shedding frequency and the distribution of the wake vortices. These two parameters are of primary interest as they play an important role in determining the mixing and heat transfer properties of the downstream flow. The results indicate that the imposed magnetic field significantly alters the dynamic behavior of the wake behind a cylinder. It is well known that beyond a critical Re, the flow around a circular cylinder generates a regular pattern of vortices known as the Karman vortex street. Analysis of such bluff-body wakes is typically divided into three main focus areas: the correlation between drag coefficient, base pressure and shedding frequency; the vortex dynamics, where the formation and re-arrangement processes are addressed; and the stability of the mean velocity profile in the wake. When a strong magnetic field is imposed on a conducting fluid, the resulting wake

164 A. H. A. Hamid, W. K. Hussam and G. J. Sheard, "Dynamics of a Quasi-Two-Dimensional Wake Behind a Cylinder in an MHD Duct Flow with a Strong Axial Magnetic Field", 19th Australasian Fluid Mechanics Conference, Melbourne, Australia, 8-11 December 2014.


possesses distinct features compared to normal hydrodynamic flows. A typical example of such flows is in fusion power-reactor breeding blankets, where an electrically conducting fluid flows in channels within the blankets under a strong plasma-confining magnetic field. This class of flows is known as Magneto-Hydro-Dynamic (MHD). The interaction between induced electric currents and the applied magnetic field results in an electromagnetic Lorentz force, which in turn gives a damping effect to the flow and subsequently alters the formation of the vortex street.
6.3.2.1 Numerical Method and Geometry
In the current investigation a flow of electrically conducting fluid passing over a circular cylinder placed on the centerline of a duct is considered. Figure 6.6 depicts the numerical domain and the corresponding macro-element mesh. The ratio of cylinder diameter to duct width (i.e. the blockage ratio, b = d/2L) is fixed at 0.1 throughout this study. Also shown in the figure is a typical Hartmann velocity profile, characterized by a flat profile in the core with velocity U0 and high gradients in the vicinity of the lateral walls. The length scale is normalized by the half channel width, L. However, for the sake of discussion, the Re and the geometrical lengths in the succeeding discussion are presented in the cylinder diameter scale, d. The use of two different length scales in MHD cylinder wake flows is inevitable: the two-dimensional linear braking term is governed by Ha and L, whereas the Re, and thus the structure of the cylinder wake, is governed by d165. A quasi-two-dimensional (Q2D) model for MHD duct flow is employed166. Under this model, the non-dimensional magneto-hydro-dynamic equations of continuity and momentum reduce to

$$\nabla\cdot\mathbf{u} = 0\ ,\qquad
\frac{\partial \mathbf{u}}{\partial t} = -(\mathbf{u}\cdot\nabla)\mathbf{u} - \nabla p + \frac{1}{\mathrm{Re}}\nabla^{2}\mathbf{u} - \frac{2\,\mathrm{Ha}}{\mathrm{Re}}\,\mathbf{u}$$

Eq. 6.2

where u and p are the velocity and pressure fields, respectively. The governing equations are discretized using a high-order, in-house solver based on the spectral-element method.

Figure 6.6 Schematic diagram of the numerical domain

165 Frank, M., Barleon, L. and Müller, U., 2001, "Visual analysis of two-dimensional magnetohydrodynamics", Physics of Fluids, 13, 2287.
166 Sommeria, J. and Moreau, R., 1982, "Why, how, and when, MHD turbulence becomes two-dimensional", Journal of Fluid Mechanics, 118, 507–518.


6.3.2.2 Result and Discussion
In all simulations, two basic regions of wake vortices are apparent: a formation region, in which the vorticity evolved from the cylinder boundary layers organizes into a vortex street, and a stable region, in which the shed vortices convect downstream in a periodic laminar manner. This section presents the results of the shedding frequency analysis and the vortex distributions. In the current investigation, the effect of the axial magnetic field on shedding frequency is of interest. It is to be noted that Ha = 0 corresponds to hydrodynamic flow. The dimensionless frequency is represented by the Strouhal number, St = fd/U0, where f is the shedding frequency, calculated from the fluctuating lift force imparted on the cylinder by the near-wake flow unsteadiness. The Strouhal number is dependent on both Ha and Re. In the range of Ha and Re considered here, St increases with increasing Ha at a given Re. This observation can be attributed to the fact that the imposed magnetic field tends to stretch the shear layer in the near wake, and hence mass conservation requires that the wake advection velocity, Uw, is increased. It can be seen in Figure 6.7 that a stronger magnetic field intensity produces a narrower wake, thus extending the formation region behind the cylinder before the shear layers roll up into a vortex street. For a detailed discussion, please see [Hamid et al.]167.

Figure 6.7 Contour plots of vorticity snapshot at Red = 160 and at Hartmann number as indicated

In conclusion, the present study has investigated the characteristics of wakes behind a circular cylinder in a rectangular duct under a strong axial magnetic field using a spectral-element method. It is found that the formation of vortex shedding and the direction of the imposed magnetic field play significant roles in determining the shedding frequency. The present investigation reveals that an axial magnetic field tends to appreciably increase St, regardless of the flow Re. Furthermore, the advection speed of the wake vortices is also a strong function of both Ha and Re, whereas Uw is only weakly dependent on Re for hydrodynamic flows.
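The shedding frequency (and hence St = fd/U0) is typically extracted from the fluctuating lift signal. The sketch below shows one way to do this with an FFT, using a synthetic lift-coefficient history in place of solver output; the diameter, velocity and signal frequency are assumed values for illustration only.

```python
# Sketch: extracting a shedding frequency (and St = f*d/U0) from a fluctuating
# lift-coefficient signal via an FFT. The signal is synthetic (assumed 2 Hz
# oscillation plus noise), standing in for the solver's lift history.
import numpy as np

d, U0 = 0.1, 1.0                     # cylinder diameter and inlet velocity (assumed)
fs, T_end = 200.0, 20.0              # sampling rate [Hz] and record length [s]
t = np.arange(0.0, T_end, 1.0 / fs)
cl = 0.8 * np.sin(2 * np.pi * 2.0 * t) + 0.05 * np.random.randn(t.size)

spec = np.abs(np.fft.rfft(cl - cl.mean()))
freqs = np.fft.rfftfreq(t.size, d=1.0 / fs)
f_shed = freqs[np.argmax(spec)]      # dominant spectral peak = shedding frequency
print(f"f = {f_shed:.2f} Hz, St = f*d/U0 = {f_shed * d / U0:.3f}")
```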

6.4 Maxwell’s Equations - Electromagnetic Waves

6.4.1 Historical Perspective
In 1845, Faraday demonstrated that a magnetic field produces a measurable effect on a beam of light. This prompted him to speculate that light involves oscillation of electric and magnetic field lines, but his limited mathematical ability prevented him from pursuing this idea. Maxwell, a young admirer of Faraday, believed that the closeness of these two numbers, the speed of light and the inverse square root of ε0 and μ0, was more than just coincidence and decided to develop Faraday's hypothesis. In 1865, he predicted the existence of electromagnetic waves that propagate at the speed of light.
6.4.2 The Finite-Difference Time-Domain Method (FDTD)
The Finite-Difference Time-Domain method (FDTD) is today one of the most popular techniques for

167 See 125.

the solution of electromagnetic problems168. It has been successfully applied to an extremely wide variety of problems, such as scattering from metal objects and dielectrics, antennas, microstrip circuits, and electromagnetic absorption in the human body exposed to radiation. The main reason for the success of the FDTD method resides in the fact that the method itself is extremely simple, even when programming a three-dimensional code. The technique was first proposed by [K. Yee]169, and then improved by others in the early 1970s. The theory behind the FDTD method is simple: to solve an electromagnetic problem, the idea is to discretize, both in time and space, Maxwell's equations with central-difference approximations. The originality of Yee's idea resides in the allocation in space of the electric and magnetic field components, and in the marching-in-time procedure. To better understand the theory of the method, we start by considering a simple one-dimensional problem and assume, at this stage, "free space" as the propagation medium. In this case, Maxwell's equations can be written as

$$\frac{\partial \mathbf{E}}{\partial t} = \frac{1}{\varepsilon_0}\,\nabla\times\mathbf{H}
\ ,\qquad
\frac{\partial \mathbf{H}}{\partial t} = -\frac{1}{\mu_0}\,\nabla\times\mathbf{E}$$

Eq. 6.3

6.4.3 Strengths of FDTD Modeling
Every modeling technique has strengths and weaknesses, and the FDTD method is no different170.
• FDTD is a versatile modeling technique used to solve Maxwell's equations. It is intuitive, so users can easily understand how to use it and know what to expect from a given model.
• FDTD is a time-domain technique, and when a broadband pulse (such as a Gaussian pulse) is used as the source, the response of the system over a wide range of frequencies can be obtained with a single simulation. This is useful in applications where resonant frequencies are not exactly known, or any time that a broadband result is desired.
• Since FDTD calculates the E and H fields everywhere in the computational domain as they evolve in time, it lends itself to providing animated displays of the electromagnetic field movement through the model. This type of display is useful in understanding what is going on in the model and helps ensure that the model is working correctly.
• The FDTD technique allows the user to specify the material at all points within the computational domain. A wide variety of linear and nonlinear dielectric and magnetic materials can be naturally and easily modeled.
• FDTD allows the effects of apertures to be determined directly. Shielding effects can be found, and the fields both inside and outside a structure can be found directly or indirectly.
• FDTD uses the E and H fields directly. Since most EMI/EMC modeling applications are interested in the E and H fields, it is convenient that no conversions must be made after the simulation has run to get these values.
6.4.4 Weaknesses of FDTD Modeling
• Since FDTD requires that the entire computational domain be gridded, and the grid spatial discretization must be sufficiently fine to resolve both the smallest electromagnetic wavelength and the smallest geometrical feature in the model, very large computational domains can result, leading to very long solution times. Models with long, thin features (like wires) are difficult to model in FDTD because of the excessively large computational domain required. Methods such as eigenmode expansion can offer a more efficient alternative, as they do not require a fine grid along the z-direction.

168 Lecture Series, Utah ECE.
169 Kane Yee (1966). "Numerical solution of initial boundary value problems involving Maxwell's equations in isotropic media". IEEE Transactions on Antennas and Propagation. 14 (3): 302–307.
170 Wikipedia.


• There is no way to determine unique values for permittivity and permeability at a material interface.
• Space and time steps must satisfy the CFL condition, or the leapfrog integration used to solve the partial differential equation is likely to become unstable.
• FDTD finds the E/H fields directly everywhere in the computational domain. If the field values at some distance are desired, it is likely that this distance will force the computational domain to be excessively large. Far-field extensions are available for FDTD, but require some amount of post-processing.
• Since FDTD simulations calculate the E and H fields at all points within the computational domain, the computational domain must be finite to permit its residence in computer memory. In many cases this is achieved by inserting artificial boundaries into the simulation space, and care must be taken to minimize the errors introduced by such boundaries. There are a number of highly effective absorbing boundary conditions (ABCs) available to simulate an infinite unbounded computational domain. Most modern FDTD implementations instead use a special absorbing "material", called a perfectly matched layer (PML), to implement absorbing boundaries.

Because FDTD is solved by propagating the fields forward in the time domain, the electromagnetic time response of the medium must be modeled explicitly. For an arbitrary response, this involves a computationally expensive time convolution, although in most cases the time response of the medium (its dispersion) can be adequately and simply modeled using the recursive convolution (RC) technique, the auxiliary differential equation (ADE) technique, or the Z-transform technique. An alternative way of solving Maxwell's equations that can treat arbitrary dispersion easily is the Pseudo-Spectral Spatial-Domain method (PSSD), which instead propagates the fields forward in space171.
6.4.5 Case Study - 1D Maxwell Equation
In the one-dimensional case, we can use only Ex and Hy, and Eq. 6.3 can be rewritten as

$$\frac{\partial E_x}{\partial t} = -\frac{1}{\varepsilon_0}\frac{\partial H_y}{\partial z}
\ ,\qquad
\frac{\partial H_y}{\partial t} = -\frac{1}{\mu_0}\frac{\partial E_x}{\partial z}$$

Eq. 6.4

which represents a plane wave traveling in the z direction. Yee's scheme consists in considering Ex and Hy shifted in space by half a cell and in time by half a time step when taking central-difference approximations of the derivatives. In such a case, the two equations in Eq. 6.4 can be written as

$$\frac{E_x^{\,n+1/2}(k) - E_x^{\,n-1/2}(k)}{\Delta t}
= -\frac{1}{\varepsilon_0}\,\frac{H_y^{\,n}(k+\tfrac{1}{2}) - H_y^{\,n}(k-\tfrac{1}{2})}{\Delta z}$$

Eq. 6.5

$$\frac{H_y^{\,n+1}(k+\tfrac{1}{2}) - H_y^{\,n}(k+\tfrac{1}{2})}{\Delta t}
= -\frac{1}{\mu_0}\,\frac{E_x^{\,n+1/2}(k+1) - E_x^{\,n+1/2}(k)}{\Delta z}$$

Eq. 6.6

171 Wikipedia.

Eq. 6.5 and Eq. 6.6 show the usefulness of Yee's scheme in obtaining a central-difference approximation for the derivatives. In particular, the left term in Eq. 6.5 says that the derivative of the E field at time nΔt can be expressed as a central difference using E field values at times (n+1/2)Δt and (n-1/2)Δt. The right term in Eq. 6.5 instead approximates the derivative of the H field at point kΔx as a central difference using H field values at points (k+1/2)Δx and (k-1/2)Δx. This scheme is known as the "leapfrog" algorithm. Practically, it means that to approximate Maxwell's equations in space and time using this algorithm, one should calculate first all H field values, then all E field values, remembering always that E and H are shifted in space by half of the discretization Δx. Figure 6.8 shows the algorithm schematically.

Figure 6.8 Illustration of a Standard Cartesian Yee cell used for FDTD for Electric and Magnetic Field

6.4.5.1 Boundary Conditions
From the previous discussion, it is not clear what happens at the mesh termination. Of course, we cannot simulate the propagation of the signal indefinitely, and we need to terminate the FDTD grid somehow. The problem does not exist in the case of a spatially limited structure, like a waveguide, a resonator, etc., where we need to model a region that "traps" the field inside. In most problems, however, we need to simulate open-space regions. In these cases, since our simulation region MUST be limited, we need to find a way to "simulate" the open space. These boundary conditions are called Radiation Boundary Conditions (RBCs) or Absorbing Boundary Conditions (ABCs). The absorbing boundary condition for the 1D case can therefore be expressed by

$$E_x^{\,n+1/2}(1) = E_x^{\,n-3/2}(2)
\ ,\qquad
E_x^{\,n+1/2}(K_E) = E_x^{\,n-3/2}(K_E - 1)$$

Eq. 6.7

where the first expression applies to the left side of the mesh, the second to the right side, and KE represents the size of the arrays E and H. With these conditions, in the 1D simulation described in the previous section the wave will be completely "absorbed" by the termination. Of course, "completely" actually means "relatively", since due to numerical errors some small reflections (noise) from the boundary will be observed.
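A minimal sketch of the full 1D scheme, combining the leapfrog updates of Eq. 6.5-6.6 with the simple absorbing boundary of Eq. 6.7, is given below. The fields are in the usual normalized form so that both update coefficients reduce to the Courant number, taken as 0.5 (which is also what makes the two-step boundary trick exact); the grid size and Gaussian-pulse source are assumptions for illustration.

```python
# Minimal 1D free-space FDTD implementing Eq. 6.5-6.6 with the simple ABC of
# Eq. 6.7. Normalized fields, Courant number 0.5 (dt = dz/2c), so an outgoing
# wave needs exactly two time steps to cross one cell.
import numpy as np

KE = 200                          # number of cells (the KE of Eq. 6.7)
ex = np.zeros(KE)                 # normalized E field at integer nodes
hy = np.zeros(KE)                 # H field at half-integer nodes
bc_lo = [0.0, 0.0]                # Ex(2) stored from two time steps back (left)
bc_hi = [0.0, 0.0]                # Ex(KE-1) stored from two time steps back (right)
kc, t0, spread = KE // 2, 40.0, 12.0   # Gaussian-pulse source parameters (assumed)

for n in range(1, 501):
    # E update (Eq. 6.5): central difference of H in space
    ex[1:KE] += 0.5 * (hy[0:KE - 1] - hy[1:KE])
    ex[kc] += np.exp(-0.5 * ((t0 - n) / spread) ** 2)   # soft Gaussian source

    # Simple ABC (Eq. 6.7): boundary node takes the neighbour's value
    # from two time steps earlier, so the outgoing wave is not reflected.
    ex[0], bc_lo = bc_lo[0], [bc_lo[1], ex[1]]
    ex[KE - 1], bc_hi = bc_hi[0], [bc_hi[1], ex[KE - 2]]

    # H update (Eq. 6.6): central difference of E in space
    hy[0:KE - 1] += 0.5 * (ex[0:KE - 1] - ex[1:KE])

print(f"max |Ex| after 500 steps: {np.abs(ex).max():.2e}")  # ~0 once absorbed
```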

6.5 Mechanisms of Nanofluids172

6.5.1 What is a Nanofluid?
In 1993, [Masuda et al.]173 measured the thermal conductivity and viscosity of three different water-based suspensions containing Al2O3, TiO2, and SiO2 nanoparticles (particles with a size between 1 and 100 nm). They reported that both the thermal conductivity and the viscosity of the water become greater

172 Omid Mahian, Lioua Kolsi, Mohammad Amani, Patrice Estelle, Goodarz Ahmadi, Clement Kleinstreuer, Jeffrey S. Marshall, Majid Siavashi, Robert A. Taylor, Hamid Niazmand, Somchai Wongwises, Tasawar Hayat, Arun Kolanjiyil, Alibakhsh Kasaeian, Ioan Pop, "Recent advances in modeling and simulation of nanofluid flows - Part I: Fundamental and theory", Physics Reports, 2018.
173 H. Masuda, A. Ebata, K. Teramae, N. Hishinuma, Alteration of Thermal Conductivity and Viscosity of Liquid by Dispersing Ultra-Fine Particles. Dispersion of Al2O3, SiO2 and TiO2 Ultra-Fine Particles, Netsu Bussei, 7 (1993).


than before by adding nanoparticles. Later, in 1995, [Choi]174 selected the name "nanofluid" for a mixture of nanoparticles and a liquid. It should be kept in mind that nanofluids are not produced simply by adding nanoparticles to water or oil and stirring the mixture, similar to mixing sugar and tea; rather, the formation of a nanofluid requires special physical and chemical processes in order to produce a uniform and efficient dispersion of particles for long-term application. Surfactants (non-covalent functionalization), functionalization of nanoparticles, pH control and sonication are some primary approaches for increasing the stability of nanofluids. Figure 6.9 presents the main physical and chemical approaches that might be used to prepare stable nanofluids. The most suitable method for preparing stable nanofluids is determined based on the nanoparticle type and the choice of the base liquid.

Figure 6.9 Nanofluid preparation is not just mixing nanoparticles and a liquid; special physical and chemical techniques are needed to have a stable nanofluid

6.5.2 What Applications are Suitable for Nanofluids?
Indeed, in most applications in which conventional fluids are used for cooling or heating purposes, nanofluids can be used to replace single-phase fluids. Nanofluids can be used in solar collectors and photovoltaic systems, car radiators, refrigerators, boilers, medicine-delivery systems, cooling of electronic equipment, lubrication of components, heating and cooling in building design, CO2 absorption, porous media, aerospace, oil recovery, and any type of liquid-based heat exchanger175,176,177. Figure 6.10 gives an overview of primary nanofluid applications.

174 S.U.S. Choi, J.A. Eastman, Enhancing thermal conductivity of fluids with nanoparticles, ASME Int. Mech. Eng. Congr. Expo. 66 (1995).
175 R. Saidur, K.Y. Leong, H.A. Mohammad, A review on applications and challenges of nanofluids, Renew. Sustain. Energy Rev. 15 (2011).
176 R. Taylor, S. Coulombe, T. Otanicar, P. Phelan, A. Gunawan, W. Lv, G. Rosengarten, R. Prasher, H. Tyagi, Small particles, big impacts: A review of the diverse applications of nanofluids, J. Appl. Phys. 113 (2013).
177 Z. Zhien, C. Jianchao, C. Feng, L. Hao, Z. Wenxian, Q. Wenjie, Progress in enhancement of CO2 absorption by nanofluids: A mini review of mechanisms and current status, Renew. Energy. 118 (2018).


Figure 6.10 Some applications of nanofluids at a glance

6.5.3 What are the Advantages and Disadvantages of Nanofluids?
Generally, by using a nanofluid the rate of heat exchange in thermal systems is enhanced because of the higher thermal conductivity of nanofluids compared to the base fluid. Therefore, to transfer a specified amount of heat, the size of a thermal system can be reduced by using nanofluids to enhance the heat transfer rate, implying a more compact system and savings in material weight and expense. In comparison with microfluids, nanofluids have higher stability and a better ability to enhance heat conduction. On the other hand, nanofluids have some disadvantages, listed as follows178:
➢ Nanofluids have a relatively high cost of production.
➢ Preservation of nanofluids for long-term use without aggregation and sedimentation of nanoparticles is a challenge.
➢ The higher viscosity of nanofluids compared to base fluids leads to increases in pumping power and increases the rate of frictional heating.
➢ Use of nanofluids can increase rates of corrosion and erosion of components in contact with the nanofluids.
6.5.4 CFD Techniques for Nanofluid Flow Solution
Generally, it is difficult to solve nanofluid flow problems analytically because of the nonlinear nature of the governing equations. Therefore, to find the velocity and temperature fields, computational fluid dynamics (CFD) techniques may be used. In this part, the main CFD techniques that are used to determine the flow and heat transfer characteristics in nanofluid flows are briefly reviewed.

178 O. Mahian, A. Kianifar, S.A. Kalogirou, I. Pop, S. Wongwises, A review of the applications of nanofluids in solar energy, Int. J. Heat Mass Transf. 57 (2013) 582–594.


These techniques can be classified into three main groups, as follows and as shown in Figure 6.11:
• Macroscale based techniques
• Mesoscale based techniques
• Microscale based techniques

Figure 6.11 Different CFD Techniques for Nano-Fluids (macroscopic: Navier-Stokes solver approaches such as finite difference, finite volume and finite element; mesoscopic: lattice Boltzmann equation and dissipative particle dynamics method; microscopic: molecular dynamics simulation (Hamilton equation))

6.5.5 Macroscale Based Techniques
Macroscale based techniques such as finite difference, finite volume, and finite element deal with converting partial differential equations into algebraic equations through discretization. Here, the common macroscale based approaches are briefly reviewed.
6.5.5.1 Finite Difference Method
Euler was probably the first to introduce the "finite difference method" (FDM) to solve differential equations, more than 200 years ago. The finite difference method is the oldest and easiest numerical approach for solving flow and heat transfer problems.
6.5.5.2 Finite Volume Method
The finite volume method (FVM) was introduced as an alternative to FDM to better handle complex flow domains. A review of the literature shows that FVM is the most common CFD procedure used in solving nanofluid flows. In this approach, the differential equations are first converted into integral form. Next, the solution domain is divided into contiguous control volumes, and the governing equation is applied to each of them. The values of the desired parameters are calculated at the center of each control volume, and the parameter values at the surfaces of the control volumes are then estimated by interpolation. Finally, for each control volume, the resulting integral terms are converted to algebraic equations by using quadrature formulae. The finite volume approach is easy to understand because every term has a clear physical meaning. The advantage of FVM is its flexibility for use in complex flows. On the other hand, one of its main disadvantages is the difficulty of extending the method beyond second-order accuracy in 3D modeling. As mentioned earlier, most numerical studies on nanofluids have used FVM.
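As a concrete illustration of the finite-volume steps just described (integrate over control volumes, interpolate face values, assemble algebraic equations), the sketch below solves 1D steady heat conduction with assumed boundary temperatures; it is a generic illustration, not tied to any particular nanofluid study.

```python
# Minimal 1D finite-volume sketch for d/dx(k dT/dx) = 0 with fixed end
# temperatures. Geometry, conductivity and boundary values are assumed.
import numpy as np

N, L, k = 10, 1.0, 1.0                 # control volumes, domain length, conductivity
dx = L / N
T_left, T_right = 100.0, 20.0          # Dirichlet boundary temperatures (assumed)

A = np.zeros((N, N))
b = np.zeros(N)
for i in range(N):
    aW = k / dx if i > 0 else 2.0 * k / dx        # half-cell to the boundary face
    aE = k / dx if i < N - 1 else 2.0 * k / dx
    A[i, i] = aW + aE                              # flux balance over control volume i
    if i > 0:
        A[i, i - 1] = -aW
    else:
        b[i] += aW * T_left
    if i < N - 1:
        A[i, i + 1] = -aE
    else:
        b[i] += aE * T_right

T = np.linalg.solve(A, b)              # cell-centre temperatures
print(np.round(T, 2))                  # linear profile between 100 and 20
```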


6.5.5.2.1 Common FVM Solvers
Two common FVM-based software packages used for the simulation of nanofluid flows are ANSYS FLUENT and ANSYS CFX. For example, Esfahani et al. [17] simulated nanofluid flow in a wavy channel using ANSYS FLUENT and the single-phase approach. The effects of various models for thermophysical properties on forced-convection turbulent flow in a tube were investigated by Minea [18], where ANSYS FLUENT was used. Minea and Lorenzini [19] employed ANSYS FLUENT to solve natural convection of ZnO/water nanofluid in a cavity by assuming a single-phase homogeneous model for the nanofluid flow. Minea [20] utilized ANSYS FLUENT to analyze the variations of the Brinkman number in a partially heated tube in which Al2O3/water nanofluid is used as coolant. Chereches et al. [21] employed ANSYS FLUENT to simulate the flow of nanofluid in a circular tube in both laminar and turbulent regimes, while the flow was assumed to be single-phase. In another study, Kaloudis et al. [22] used ANSYS CFX to model the nanofluid flow in a parabolic trough collector with a two-phase (Eulerian-Eulerian) approach. Figure 6.12 shows the scale of the problems considered by [17,22]. Researchers are increasingly using open-source software such as OpenFOAM, which provides a C++ toolbox for solving nanofluid-based problems. The advantage of open-source software is both the reduced cost and the ease of adapting and modifying the software for specific problems. For example, Meng et al. [23] utilized OpenFOAM to model nanofluid flow in a square cavity using both single-phase and two-phase approaches.
6.5.5.3 Finite Element Method
In the finite element method (FEM) the solution domain is divided into a series of finite elements (often unstructured) and the governing equations are solved as weighted integrals over these elements. Before discretizing the governing equations, each term is multiplied by a weight function; the use of such weight functions is one of the main differences between FEM and FVM. The final equations are in the form of nonlinear algebraic equations. The main advantage of FEM is its flexibility in dealing with complex geometry. One of the disadvantages of FEM is related to round-off errors and their accumulation. Two FEM-based software packages that have been used for nanofluids are COMSOL Multiphysics and FlexPDE, although the former package is more commonly used. For example, Nasrin et al. [24] used COMSOL to model the performance of a photovoltaic thermal (PV/T) system using nanofluids with the single-phase approach. They compared the results of the numerical simulation with experimental data and found good agreement between them. In another work, Bianco et al. [25] investigated thermal efficiency and entropy generation in a PV/T system using COMSOL, where the nanofluid flow was considered as a single-phase fluid. [Gunjo et al.] [26] investigated

Figure 6.12 (a) Flow in a wavy channel solved by ANSYS FLUENT [17] (b) flow in a parabolic trough collector solved by ANSYS CFX [22] (figures are reprinted with permission from publisher)

6.5.5.3 Finite Element Method In finite element method (FEM) the domain of solution is divided into series of finite element (often unstructured) and governing equation are solved as weighted integrals over these elements. Before discretizing the governing equations, each term multiplied by a weighted function, the use of each is one of the main difference between FEM and FVM. The finial equations are in form on nonlinear algebraic equations. The main advantages of FEM is its flexibility in dealing with complex geometry. One of disadvantaged of FEM may be related to round off errors and its accumulation. Two FEM based software packages that have been used for nanofluid are COMSOL Multiphysics and FlexPDE, although the former package is more commonly only used. For example, Nasrin et al. [24] used COMSOL to model the pe." ma ce of a photovoltaic thermal (PV/T) system using nanofluids with the single-phase approach. They compared the results of the numerical simulation with experimental data and found computational agreement between them. In another work, Bianco et al. [25] investigated thermal efficiency and anta, y generation in a PV/T system using COMSOL where nanofluid flow was considered as a single case fluid. [Gunjo et al]. [26] investigated


melting and solidification in a shell-and-tube regenera. type latent heat storage system where the working fluid was a mixture of paraffin and nanopa tiu'as. They used COMSOL for the modeling by considering the temperature-dependent of prope.. as low Figure 6.13 (a)). Hatami and Jing [27,28] used FlexPDE to valu. te the performance of direct absorber solar collectors by nanofluids (see Figure 6.13 (b)).

Figure 6.13 (a) Flow in a shell and tube regenerative type latent heat storage system solved by COMSOL Multiphysics (b) Analysis of a Direct Solar Absorption Collector by FlexPDE[27] (Figure reprinted with permission from publisher)

6.5.6 Dynamics of Nanoparticle Motion in a Liquid
Knowledge of the dynamics of nanoparticles in the base liquid is a prerequisite for accurately describing the processes of heat transfer in nanofluid flows. There is a wide range of forces that act on particles suspended in a fluid, but only a fraction of these forces are significant for nanofluids due to the small particle size. In general, forces acting on particles suspended in a fluid include those induced by:
1. The base liquid
2. Surrounding walls and solid surfaces
3. Other nanoparticles
4. External magnetic or electric fields (if any)
5. An acoustic field (if any)
Figure 6.14 summarizes the main forces acting on a general suspended particle in a fluid flow, with detailed discussion provided in [Mahian et al.]179.

179 Omid Mahian, Lioua Kolsi, Mohammad Amani, Patrice Estelle, Goodarz Ahmadi, Clement Kleinstreuer, Jeffrey S. Marshall, Majid Siavashi, Robert A. Taylor, Hamid Niazmand, Somchai Wongwises, Tasawar Hayat, Arun Kolanjiyil, Alibakhsh Kasaeian, Ioan Pop, "Recent advances in modeling and simulation of nanofluid flows - Part I: Fundamental and theory", Physics Reports, 2018.


Figure 6.14 Forces acting on a general particle suspended in a fluid flow by different sources

6.5.7 Other Macroscale Based Techniques
There are some other macroscale based techniques that have been used to solve nanofluid flows, which are briefly reviewed here.
6.5.7.1 Control Volume Finite Element Method
The Control Volume Finite Element Method (CVFEM) is a powerful technique for solving fluid flows in complex geometries. Indeed, CVFEM is a combination of FEM and FVM, so it combines the benefits of these two techniques.


6.5.7.2 Boundary Element Method
The term "boundary element method" (BEM) is used to describe a numerical approach that uses a Green's function formulation to convert a 3D partial differential equation into an integral equation over a surface, typically the bounding surface of a body immersed in the flow field (typically called the boundary integral equation). This boundary integral equation can then be discretized to obtain a matrix system. When applicable, BEM enables a very significant reduction of computation time, since it is only necessary to solve for the unknown variables on the bounding surface, rather than throughout the domain.
6.5.7.3 Spectral Method
The spectral method is a global approach (unlike FDM) that is based on expansion of the dependent variables in eigenfunction expansions, which must be selected to be compatible with the differential equation, the boundary conditions and the coordinate system used for the numerical solution. In a spectral method, the value of a parameter at each point is obtained by inversion of the eigenfunction expansions, and so it depends on information obtained from the whole computational domain.
6.5.7.4 Meshless Methods
To use common CFD techniques such as FDM, FVM, and FEM, a mesh must first be generated, which can sometimes be a challenge. To avoid this, meshless (also called meshfree or gridless) methods have been proposed. In a meshless method a set of nodes (without any connectivity) is scattered throughout the solution domain. The most common meshless methods are:

• Meshfree local Petrov-Galerkin
• Finite point methods - radial basis function approach
• Meshfree boundary schemes

In comparison with a standard FEM, a meshless approach is easier to use for modeling 3D problems and can have higher accuracy. On the other hand, the computational cost of a meshless approach is often greater than that of a standard FEM. As an alternative, Lagrangian schemes are another example of meshless methods in which no mesh is used in the solution. Generally, a set number of fluid molecules is distributed randomly in the flow domain. For example, in Molecular Dynamics (MD) and Dissipative Particle Dynamics (DPD), the motion of these molecules is governed by Newton's second law of motion, where a set of forces acts on the fluid. In MD, the forces between the molecules are calculated by using inter-particle potentials, such as the Lennard-Jones potential. In the DPD approach, the forces consist of friction forces, conservative forces, and random forces. Based on knowledge of the forces acting on the molecules, the acceleration, velocity, and position of the particles can be calculated. This allows statistical averaging to be implemented to calculate the bulk properties of the domain such as density, viscosity, mass diffusivity, velocity, pressure, stream function, and vorticity. For thermal transport problems, the first law of thermodynamics is applied to the fluid molecules, which can then be solved for the molecular temperature in the flow domain.
6.5.7.5 Lattice Boltzmann Method
The lattice Boltzmann method (LBM) was first introduced in the 1980s. The application of this method in simulating heat and mass transfer of fluids, especially in complex geometries and multicomponent flows, has increased in the intervening years due to its effectiveness, flexibility and simplicity in comparison to traditional computational fluid dynamics methods. The fundamental idea of LBM is based on the discrete movement of a set of artificial fluid particles placed on lattices. The motion of these fluid particles is simulated by following the evolution of a prescribed Boltzmann equation. The most essential advantage of the LBM is the incorporation of microscopic


physical interactions of the fluid particles in the numerical simulation, which reveals the mesoscale mechanisms of hydrodynamics.
6.5.7.6 Dissipative Particle Dynamics Method
The continuum Navier-Stokes equations (NSE) are generally used to study energy transport in micro/nanoscale heat transfer applications such as nanofluids. The major concern, particularly with nanoparticle aerosols, is whether the micro/nanoscale should be simulated or whether the continuum assumption is still applicable. This issue can be resolved by comparing the mean free path of the base fluid to the scale of the problem under consideration. For aerosol systems, for which the mean free path is of the order of tens of nm (e.g. the mean free path of air is about 60 nm), the no-slip assumption may begin to break down, even for microscale systems. A similar breakdown of the continuum assumptions can occur in liquid nanofluid systems for problems involving length scales on the order of about 10 nm or less. The most familiar approach among discrete particle simulation methods is molecular dynamics (MD), where the process starts from tracking individual atoms of the fluid and calculating their velocities, accelerations and forces. From these microscopic details, further bulk properties such as temperature, pressure, viscosity, density, mass diffusivity, and flow rates can be recovered using statistical sampling. Such analysis gives comprehensive data about the microscopic details of the simulated system. However, the primary drawback of this method is the extremely small physical and time scales, which are not appropriate for tracking the larger temporal and spatial scales that occur in bulk heat transfer applications. In general, the MD method is computationally expensive and time consuming, even for cutting-edge supercomputers. The time and spatial scales of heat transfer at the nanoscale are larger than MD, yet smaller than the conventional continuum scales. Thus, intermediate spatial and temporal scales must be captured. These scales can be captured using mesoscopic particle-based methods, the so-called "coarse-graining" methods, in which each simulated particle represents a group of actual fluid molecules. These methods possess the unique capacity to model relatively bulk physical systems and efficiently capture the essential details of the pertinent interactions within the fluid molecules. A relatively recent coarse-grained technique of this type is the dissipative particle dynamics (DPD) method, a coarse-grained version of MD where each DPD particle represents a group or packet of actual molecules. DPD particles are randomly distributed in the flow domain, and particle interactions obey conservation of mass, momentum and energy.
6.5.8 Microscale Based Techniques (Molecular Dynamic Simulation)
The fluid flow is usually solved for nanofluids using continuum theory, which involves the hypothesis of a continuum medium. As mentioned in the previous section, the continuum approximation can begin to break down for problems at the smaller end of the nanometer size range (e.g. flow around carbon nanotubes, etc.) due to comparable characteristic dimensions of the fluid.
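A minimal sketch of the MD update described above, with pair forces from a Lennard-Jones potential driving Newton's second law through a velocity-Verlet integrator, is shown below. The two-particle setup and the reduced LJ units are assumptions chosen purely for illustration.

```python
# Sketch of an MD step: Lennard-Jones pair forces + Newton's second law,
# integrated with velocity Verlet. Two particles in reduced units (assumed).
import numpy as np

def lj_force(r_vec, eps=1.0, sigma=1.0):
    """Force on particle 1 due to particle 2 for separation vector r_vec."""
    r = np.linalg.norm(r_vec)
    mag = 24.0 * eps * (2.0 * (sigma / r) ** 12 - (sigma / r) ** 6) / r
    return mag * r_vec / r

x = np.array([[0.0, 0.0, 0.0], [1.5, 0.0, 0.0]])   # positions (reduced units)
v = np.zeros_like(x)                               # initial velocities
m, dt = 1.0, 0.005

f = lj_force(x[0] - x[1])
forces = np.array([f, -f])                         # Newton's third law
for _ in range(1000):                              # velocity-Verlet loop
    x += v * dt + 0.5 * forces / m * dt ** 2
    f_new = lj_force(x[0] - x[1])
    forces_new = np.array([f_new, -f_new])
    v += 0.5 * (forces + forces_new) / m * dt
    forces = forces_new

print("separation after 1000 steps:", np.linalg.norm(x[0] - x[1]))
```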
6.5.9 Flow Properties in Nanofluids Environment
So far there are no general mechanisms to rationalize the strange behavior of nanofluids, including the highly improved effective thermal conductivity, although many possible factors have been considered, including Brownian motion, the liquid-solid interface layer, ballistic phonon transport, and surface charge state180.

Table 6.1 Nomenclature used in this study
α - aspect ratio of nanoparticles
β - ratio of the nanolayer thickness to the original particle radius
φ - volume fraction of nanoparticles in suspension
b - base fluid (subscript)
p - nanoparticle (subscript)
eff - effective (subscript)


However, there are still several other possible macro-scale explanations such as heat conduction, particle-driven natural convection, convection induced by electrophoresis, thermophoresis, etc. To facilitate the discussion, Table 6.1 displays the nomenclature used in this study.
6.5.9.1 Density and Specific Heat
The calculation of the effective density ρeff and the effective specific heat Cp,eff of a nanofluid is straightforward. They can be estimated based on the physical principle of the mixture rule as:

$$\rho_{\mathrm{eff}} = \left(\frac{m}{V}\right)_{\mathrm{eff}} = \frac{m_b + m_p}{V_b + V_p} = (1-\varphi_p)\,\rho_b + \varphi_p\,\rho_p$$

Eq. 6.8

where φ is the volume fraction. For the effective specific heat we have

$$(\rho C_p)_{\mathrm{eff}} = \rho_{\mathrm{eff}}\left(\frac{Q}{m\,\Delta T}\right)_{\mathrm{eff}} = (1-\varphi_p)(\rho C_p)_b + \varphi_p(\rho C_p)_p
\quad\text{or}\quad
(C_p)_{\mathrm{eff}} = \frac{(1-\varphi_p)(\rho C_p)_b + \varphi_p(\rho C_p)_p}{(1-\varphi_p)\,\rho_b + \varphi_p\,\rho_p}$$

Eq. 6.9
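As a quick worked example of the mixture rules in Eq. 6.8 and Eq. 6.9, the snippet below evaluates the effective density and specific heat for an assumed 3 vol% Al2O3/water nanofluid; the property values are typical textbook numbers rather than data from a specific reference.

```python
# Mixture-rule evaluation of Eq. 6.8 and Eq. 6.9 for an assumed Al2O3/water
# nanofluid at room temperature (all property values are assumptions).
def nanofluid_rho_cp(phi_p, rho_b, cp_b, rho_p, cp_p):
    rho_eff = (1.0 - phi_p) * rho_b + phi_p * rho_p                    # Eq. 6.8
    rho_cp_eff = (1.0 - phi_p) * rho_b * cp_b + phi_p * rho_p * cp_p   # Eq. 6.9
    return rho_eff, rho_cp_eff / rho_eff                               # rho_eff, cp_eff

rho_eff, cp_eff = nanofluid_rho_cp(phi_p=0.03,                # 3 vol% particles
                                   rho_b=997.0, cp_b=4180.0,  # water (assumed)
                                   rho_p=3970.0, cp_p=765.0)  # Al2O3 (assumed)
print(f"rho_eff = {rho_eff:.0f} kg/m3, cp_eff = {cp_eff:.0f} J/kg-K")
```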

6.5.9.2 Thermal Conductivity Currently, there is no reliable theory to predict the anomalous thermal conductivity of nanofluids. From the experimental results of many researchers, it is known that the thermal conductivity of nanofluids depends on parameters including the thermal conductivities of the base fluid and the nanoparticles, the volume fraction, the surface area, and the shape of the nanoparticles, and the temperature. There are no theoretical formulas currently available to predict the thermal conductivity of nanofluids satisfactorily. However, there exist several semi-empirical correlations for calculating the apparent conductivity of two-phase mixtures. They are mainly based on the following definition of the effective thermal conductivity of a two-component mixture

$$k_{\mathrm{eff}} = \frac{k_p\,\varphi_p\left(\dfrac{dT}{dx}\right)_p + k_b\,\varphi_b\left(\dfrac{dT}{dx}\right)_b}{\varphi_p\left(\dfrac{dT}{dx}\right)_p + \varphi_b\left(\dfrac{dT}{dx}\right)_b}
= k_b\,\frac{k_p + 2k_b + 2(k_p - k_b)\varphi}{k_p + 2k_b - (k_p - k_b)\varphi}$$

Eq. 6.10

where kp is the thermal conductivity of the particle, kb is the thermal conductivity of the base fluid and φ is the particle volume fraction in the suspension. Maxwell's formula shows that the effective thermal conductivity of nanofluids relies on the thermal conductivity of the spherical particles, the base fluid and the volume fraction of the solid particles. There are numerous suggestions proposed by researchers to analyze the interactions among randomly distributed particles181.
6.5.9.3 Viscosity
Einstein (1956) was the first to calculate the effective viscosity of a suspension of spherical solids using the phenomenological hydrodynamic equations. By assuming that the disturbance of the flow


pattern of the matrix base fluid caused by a given particle does not overlap with the disturbance of flow caused by the presence of a second suspended particle, he derived the following equations

μeff = (1 + 2.5φp )μb

Eq. 6.11

Ever since Einstein's initial work, researchers have made progress in extending the Einstein theory in three major areas:
1. To extend the Einstein equation to higher particle volume concentrations by including particle-particle interactions; the resulting theoretical equation can be expressed as μeff = (1 + c1φp + c2φp^2 + c3φp^3 + ...)μb.
2. To take into account the fact that the effective viscosity of a mixture becomes infinite at the maximum particle volume concentration φp,max; such theoretical equations usually have the term [1 - (φp/φp,max)]^α in the denominator.
3. To include the effect of non-spherical particle concentrations (see [Wang and Mujumdar]182).
Experimental data for the effective viscosity of nanofluids are limited to certain nanofluids, and the ranges of the parameters (particle volume concentration, temperature, etc.) are also limited. Still, the experimental data show the trend that the effective viscosities of nanofluids are higher than the existing theoretical predictions. In an attempt to rectify this situation, researchers have proposed equations for specific applications. The problem with these equations is that they do not reduce to the Einstein equation at very low particle volume concentrations and, hence, lack a sound physical basis.
6.5.9.4 Heat Transfer Coefficient
Since heat transfer performance is a better indicator than the effective thermal conductivity for nanofluids used as coolants in transportation and other industries, the modeling of nanofluid heat transfer coefficients is gaining attention from researchers. However, it is still at an early stage, and the theoretical models for nanofluid heat transfer coefficients are quite limited. All the equations are modified from traditional equations such as the Dittus-Boelter equation [Dittus and Boelter]183 or the Gnielinski equation [Gnielinski]184, with empirical parameters added. Therefore, these equations are only valid for certain nanofluids over small parameter ranges. More experimental and theoretical studies are needed before general models can be developed and verified. Recently, [Polidori et al.]185 investigated the natural convection heat transfer of Newtonian γ-Al2O3/water nanofluids in a laminar external boundary layer using the integral formalism approach. Based on macroscopic modeling and under the assumption of constant thermophysical nanofluid properties, it is shown that natural convection heat transfer is not solely characterized by the nanofluid effective thermal conductivity and that the sensitivity to the viscosity model used seems undeniable and plays a key role in the heat transfer behavior. [Mansour et al.]186 investigated the effect of uncertainty in the physical properties of γ-Al2O3/water nanofluid on its thermohydraulic performance for both laminar and turbulent fully developed forced convection in a tube with uniform

182 Xiang-Qi Wang and Arun S. Mujumdar, "A Review on Nanofluids - Part I: Theoretical and Numerical Investigations", Brazilian Journal of Chemical Engineering, October-December 2008.
183 Dittus, F. and Boelter, L., Heat transfer in automobile radiators of the tubular type. University of California Publications in Engineering, 2, 443–461 (1930).
184 Gnielinski, V., New equations for heat and mass transfer in turbulent pipe and channel flow. International Chemical Engineering, 16, 359–368 (1976).
185 Polidori, G., Fohanno, S., and Nguyen, C. T., A note on heat transfer modelling of Newtonian nanofluids in laminar free convection. International Journal of Thermal Sciences, 46, no. 8, 739–744 (2007).
186 Mansour, R. B., Galanis, N., and Nguyen, C. T., Effect of uncertainties in physical properties on forced convection heat transfer with nanofluids. Applied Thermal Engineering, 27, no. 1, 240–249 (2007).


heat flux. Since the effects of certain nanofluid characteristics, such as average particle size and the spatial distribution of nanoparticles, on these properties are not presently known precisely, it is quite difficult to draw conclusions as to the presumed advantages of nanofluids over conventional heat transfer fluids. More experimental data regarding these effects are needed in order to assess the true potential of nanofluids.
6.5.10 Numerical Simulation
For numerical simulations, two approaches have been adopted in the literature to investigate the heat transfer characteristics of nanofluids. The first approach assumes that the continuum assumption is still valid for fluids with suspended nano-sized particles. The other approach uses a two-phase model for a better description of both the fluid and the solid phases, but it is not common in the open literature. The single-phase model is much simpler and computationally more efficient. Another approach is to adopt the Boltzmann theory. The heat transfer enhancement using nanofluids may be affected by several factors, such as Brownian motion, layering at the solid/liquid interface, ballistic phonon transport through the particles, nanoparticle clustering and the friction between the fluid and the solid particles; it is difficult, however, to describe all these phenomena mathematically. [Maïga et al.]187-188 numerically investigated the hydrodynamic and thermal characteristics of nanofluids flowing through a uniformly heated tube (L = 1 m) in both laminar and turbulent regimes using the single-phase model with adjusted properties. Results showed that the addition of nanoparticles can increase the heat transfer substantially compared to the base fluid alone. It was also found that ethylene glycol-γ Al2O3 provided better heat transfer enhancement than water-γ Al2O3 nanofluids. However, the same researchers also discussed the disadvantages of nanofluids with respect to heat transfer: the inclusion of nanoparticles introduced drastic effects on the wall shear stress, which increased with an increase of the solid volume fraction. A new correlation was proposed to describe the thermal performance of Al2O3/water nanofluids in the turbulent regime, Nu_fully = 0.085 Re^0.71 Pr^0.35, which is valid for 10^4 ≤ Re ≤ 5×10^5, 6.6 ≤ Pr ≤ 13.9 and 0 ≤ φ ≤ 10%. [Roy et al.]189 conducted a numerical study of heat transfer for water-γ Al2O3 nanofluids in a radial cooling system. They found that the addition of nanoparticles in the base fluid increased the heat transfer rates considerably; the use of 10 vol% nanoparticles resulted in a two-fold increase of the heat transfer rate compared to that of the pure base fluid. Their results are similar to those of [Maïga et al.], since they both used the same model. [Wang et al.]190 numerically investigated the free convective heat transfer characteristics of a two-dimensional cavity over a range of Grashof numbers and solid volume fractions for various nanofluids. Their results showed that suspended nanoparticles significantly increased the heat transfer rate at all Grashof numbers. For water-γ Al2O3 nanofluid, the increase in the average heat transfer coefficient was approximately 30% for 10 vol% nanoparticles. The maximum increase in heat transfer performance of 80% was obtained for 10 vol% Cu nanoparticles dispersed in water. Furthermore, the average heat transfer coefficient was seen to increase by up to 100% for a nanofluid consisting of oil containing 1 vol% carbon nanotubes.
Furthermore, the presence of nanoparticles in the base fluid was found to alter the structure of the fluid flow for horizontal orientation of the heated wall.

187 Maiga, S. E. B., Nguyen, C. T., Galanis, N., and Roy, G., Heat transfer behaviors of nanofluids in a uniformly heated tube. Superlattices and Microstructures, 35, 543–557 (2004).
188 Maiga, S. E. B., Nguyen, C. T., Galanis, N., and Roy, G., Hydrodynamic and thermal behaviors of a nanofluid in a uniformly heated tube. Volume 5 of Computational Studies, 453–462. WIT Press, Southampton, SO40 7AA, United Kingdom, Lisbon, Portugal (2004).
189 Roy, G., Nguyen, C. T., and Lajoie, P.-R., Numerical investigation of laminar flow and heat transfer in a radial flow cooling system with the use of nanofluids. Superlattices and Microstructures (2004).
190 Wang, X.-Q., Mujumdar, A. S., and Yap, C., Free Convection Heat Transfer in Horizontal and Vertical Rectangular Cavities Filled with Nanofluids. In International Heat Transfer Conference IHTC-13, Sydney, Australia (2006).


Recently, [Abu-Nada]191 numerically investigated heat transfer over a backward-facing step (BFS) with nanofluids using the finite volume method. They found that the average Nusselt number increased with the volume fraction of nanoparticles for the whole range of Reynolds numbers (200 ≤ Re ≤ 600) studied. Numerical simulation of natural convection in horizontal concentric annuli, a horizontal cylinder and a partially heated rectangular enclosure using nanofluids was also carried out by [Abu-Nada et al.]192. Results showed that, for nanoparticles such as Cu, Ag, Al2O3 and TiO2, the inclusion of different types and different volume fractions of nanoparticles in the base fluid (water) had an adverse effect on heat transfer performance. From the microscopic point of view, the traditional computational methods for two-phase mixture flow do not reveal the inherent nature of the fluid flow and heat transfer characteristics of nanofluids. A microscopic approach is needed to describe the effects of interactions between the suspended nanoparticles and the base liquid particles as well as among the solid particles. The lattice Boltzmann equation is one of the methods available to deal with such problems. By considering the external and internal forces on the nanoparticles and the mechanical and thermal interactions among the nanoparticles and fluid molecules, researchers have simulated nanoparticle distributions and the flow of nanofluids using the lattice Boltzmann model. The increased temperature of the fluid could enhance the dispersion of the nanoparticles, which is an important factor responsible for heat transfer enhancement in nanofluids. [Xuan et al.]193 observed that the random motion of nanoparticles tends to flatten the temperature distribution near the boundary wall. Due to the irregular fluctuation of the suspended nanoparticles, the Nusselt number distribution fluctuates along the main flow direction rather than exhibiting the smooth distribution of the base fluid. Their results indicated that the distribution and volume fraction of the nanoparticles are important factors determining the temperature distribution and heat transfer improvement with nanofluids. Another interesting numerical investigation was conducted by [Xue et al.]194 using non-equilibrium molecular dynamics simulations. They studied the effect of the liquid-solid interface on the interfacial thermal resistance and found that a simple monatomic liquid around the solid particle had no influence on the thermal transport either normal or parallel to the surface. They suggested that the large improvement of thermal conductivity in nanofluids cannot be explained by thermal transport in the liquid-solid interface layer. In summary, it is difficult to identify an established theory to predict accurately the heat transfer characteristics of nanofluids. Many researchers treat nanofluids as a single-phase fluid rather than a two-phase mixture. However, the particle-liquid interaction and the relative movement between the particles and the liquid should play important roles in affecting the convective heat transfer performance of nanofluids.
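For reference, the sketch below evaluates the turbulent correlation quoted earlier, Nu = 0.085 Re^0.71 Pr^0.35, and converts it into a heat transfer coefficient h = Nu k/D. The operating point, nanofluid conductivity and tube diameter are assumed values used only to show the arithmetic.

```python
# Evaluating the turbulent nanofluid correlation quoted above,
# Nu = 0.085 Re^0.71 Pr^0.35 (10^4 <= Re <= 5e5, 6.6 <= Pr <= 13.9),
# and converting it to h = Nu*k/D. All input values are assumptions.
def nu_nanofluid(re, pr):
    if not (1e4 <= re <= 5e5 and 6.6 <= pr <= 13.9):
        raise ValueError("outside the stated validity range of the correlation")
    return 0.085 * re ** 0.71 * pr ** 0.35

Re, Pr = 3.0e4, 7.0            # assumed operating point
k_nf, D = 0.65, 0.02           # W/m-K, m: assumed nanofluid conductivity / tube bore
Nu = nu_nanofluid(Re, Pr)
print(f"Nu = {Nu:.0f},  h = {Nu * k_nf / D:.0f} W/m2-K")
```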

191 Abu-Nada, E. Application of nanofluids for heat transfer enhancement of separated flows encountered in a backward facing step. International Journal of Heat and Fluid Flow, 29, no. 1, 242–249 (2008).
192 Abu-Nada, E., Masoud, Z., Hijazi, A. Natural convection heat transfer enhancement in horizontal concentric annuli using nanofluids. Int. Communications in Heat and Mass Transfer, 35, no. 5, 657–665 (2008).
193 Xuan, Y. and Yao, Z. Lattice Boltzmann model for nanofluids. Heat and Mass Transfer/Waerme- und Stoffuebertragung, 41, no. 3, 199–205 (2005).
194 Xue, L., Keblinski, P., Phillpot, S. R., Choi, S. U. S., and Eastman, J. A. Effect of liquid layering at the liquid-solid interface on thermal transport. International Journal of Heat and Mass Transfer, (2004).



7 Modern Computer Architectures

7.1 Background
As we know, computers and software are among the pillars of CFD, and the next two chapters are devoted to them. We have argued that a CFD analyst does not need to be a computer expert; nevertheless, knowing the essentials never hurts (see Figure 7.1). So it is wise to become familiar with modern computer architectures, as well as software optimization, as detailed in [Severance & Dowd]195. Even if you could speed up the computational aspects of a processor infinitely, you still must load and store the data and instructions to and from a memory. Today's processors continue to creep ever closer to infinitely fast processing, but memory performance is increasing at a much slower rate (it will take longer for memory to become infinitely fast). Many of the interesting problems in high performance computing use a large amount of memory. As computers get faster, the size of the problems they tend to operate on also goes up. The trouble is that when you want to solve these problems at high speed, you need a memory system that is large yet at the same time fast; a big challenge. Possible approaches include the following:


➢ Every memory system component can be made individually fast enough to respond to every memory access request.
➢ Slow memory can be accessed in a round-robin fashion (hopefully) to give the effect of a faster memory system.
➢ The memory system design can be made wide so that each transfer contains many bytes of information.
➢ The system can be divided into faster and slower portions and arranged so that the fast portion is used more often than the slow one.

Figure 7.1   Contributions from other disciplines to CFD

Again, economics are the dominant force in the computer business. A cheap, statistically optimized memory system will be a better seller than a prohibitively expensive, blazingly fast one, so the first choice is not much of a choice at all. But these choices, used in combination, can attain a good fraction of the performance you would get if every component were fast. Chances are very good that your high performance workstation incorporates several or all of them. Once the memory system has been decided upon, there are things we can do in software to see that it is used efficiently. A compiler that has some knowledge of the way memory is arranged and the details of the caches can optimize their use to some extent. The other place for optimizations is in user applications, as we'll see later in the

195 Charles Severance, Kevin Dowd, "High Performance Computing", Rice University, Houston, Texas, 2012.


book. A good pattern of memory access will work with, rather than against, the components of the system. Next, we discuss how the pieces of a memory system work. We look at how patterns of data and instruction access factor into your overall runtime, especially as CPU speeds increase. We also talk a bit about the performance implications of running in a virtual memory environment196.

7.2 Memory Technology
Almost all fast memories used today are semiconductor-based197. They come in two flavors: Dynamic Random Access Memory (DRAM) and Static Random Access Memory (SRAM). The term random means that you can address memory locations in any order. This distinguishes them from serial memories, where you have to step through all intervening locations to get to the particular one you are interested in. An example of a storage medium that is not random is magnetic tape. The terms dynamic and static have to do with the technology used in the design of the memory cells. DRAMs are charge-based devices, where each bit is represented by an electrical charge stored in a very small capacitor. The charge can leak away in a short amount of time, so the system has to be continually refreshed to prevent data from being lost. The act of reading a bit in DRAM also discharges the bit, requiring that it be refreshed. It's not possible to read the memory bit in the DRAM while it's being refreshed. SRAM is based on gates, and each bit is stored in four to six connected transistors. SRAM memories retain their data as long as they have power, without the need for any form of data refresh. DRAM offers the best price/performance, as well as the highest density of memory cells per chip. This means lower cost, less board space, less power, and less heat. On the other hand, some applications such as cache and video memory require higher speed, to which SRAM is better suited. Currently, you can choose between SRAM and DRAM at slower speeds, down to about 50 nanoseconds (ns). SRAM has access times down to about 7 ns at higher cost, heat, power, and board space. In addition to the basic technology to store a single bit of data, memory performance is limited by the practical considerations of the on-chip wiring layout and the external pins on the chip that communicate the address and data information between the memory and the processor.

7.2.1 Memory Access Time
The amount of time it takes to read or write a memory location is called the memory access time. Whereas the access time says how quickly you can reference a memory location, the cycle time describes how often you can repeat references. They sound like the same thing, but they're not. For instance, if you ask for data from DRAM chips with a 50-ns access time, it may be 100 ns before you can ask for more data from the same chips. This is because the chips must internally recover from the previous access. Also, when you are retrieving data sequentially from DRAM chips, some technologies have improved performance: on these chips, data immediately following the previously accessed data may be accessed as quickly as 10 ns198.

7.2.2 Memory Access Patterns
The best pattern is the most straightforward: increasing and unit sequential. For an array with a single dimension, stepping through one element at a time will accomplish this. For multiply-dimensioned arrays, access is fastest if you iterate on the array subscript offering the smallest stride or step size. In FORTRAN programs, this is the leftmost subscript; in C, it is the rightmost. The FORTRAN loop below has unit stride, and therefore will run quickly:

196 Charles Severance, Kevin Dowd, "High Performance Computing", Rice University, Houston, Texas, 2012.
197 Magnetic core memory is still used in applications where radiation hardness (resistance to changes caused by ionizing radiation) is important.
198 Charles Severance, Kevin Dowd, "High Performance Computing", Rice University, Houston, Texas, 2012.


      DO J = 1, N
        DO I = 1, N
          A(I,J) = B(I,J) + C(I,J) * D
        ENDDO
      ENDDO

In contrast, the next loop is slower because its stride is N. As N increases from one to the length of the cache line (adjusting for the length of each element), the performance worsens. Once N is longer than the length of the cache line (again adjusted for element size), the performance won't decrease:

      DO J = 1, N
        DO I = 1, N
          A(J,I) = B(J,I) + C(J,I) * D
        ENDDO
      ENDDO

Here's a unit-stride loop like the previous one, but written in C:

      for (i = 0; i < n; i++)
        for (j = 0; j < n; j++)
          a[i][j] = a[i][j] + c[i][j] * d;

7.2.2.1 Loop Interchange to Ease Memory Access Patterns
Loop interchange is a good technique for lessening the impact of strided memory references. Let's revisit our FORTRAN loop with non-unit stride. The good news is that we can easily interchange the loops; each iteration is independent of every other:

      DO J = 1, N
        DO I = 1, N
          A(J,I) = B(J,I) + C(J,I) * D
        ENDDO
      ENDDO

After interchange, A, B, and C are referenced with the leftmost subscript varying most quickly. This modification can make an important difference in performance. We traded three N-stride memory references for unit strides:

      DO I = 1, N
        DO J = 1, N
          A(J,I) = B(J,I) + C(J,I) * D
        ENDDO
      ENDDO

7.2.3 Virtual Memory
Virtual memory decouples the addresses used by the program (virtual addresses) from the actual addresses where the data is stored in memory (physical addresses). Your program sees its address space starting at 0 and working its way up to some large number, but the actual physical addresses assigned can be very different. It gives a degree of flexibility by allowing all processes to believe they have the entire memory system to themselves. Another trait of virtual memory systems is that they divide your program's memory up into chunks called pages. Page sizes vary from 512 bytes to 1 MB or


larger, depending on the machine. Pages don't have to be allocated contiguously, though your program sees them that way. By being separated into pages, programs are easier to arrange in memory, or move portions out to disk.

7.3 Registers
At least the top layer of the memory hierarchy, the CPU registers, operates as fast as the rest of the processor. The goal is to keep operands in the registers as much as possible. This is especially important for intermediate values used in a long computation such as:

      X = G * 2.41 + A / W - W * M

While computing the value of A divided by W, we must store the result of multiplying G by 2.41. It would be a shame to have to store this intermediate result in memory and then reload it a few instructions later. On any modern processor with moderate optimization, the intermediate result is stored in a register. Also, the value W is used in two computations, and so it can be loaded once and used twice to eliminate a wasted load. Compilers have been very good at detecting these types of optimizations and efficiently making use of the available registers since the 1970s. Adding more registers to the processor has some performance benefit, but it is not practical to add enough registers to store the entire problem data, so we must still use the slower memory technology.

7.4 Caches
Once we go beyond the registers in the memory hierarchy, we encounter caches. Caches are small amounts of SRAM that store a subset of the contents of the memory. The hope is that the cache will have the right subset of main memory at the right time. The actual cache architecture has had to change as the cycle time of the processors has improved. The processors are so fast that off-chip SRAM chips are not even fast enough. This has led to a multilevel cache approach with one, or even two, levels of cache implemented as part of the processor. Table 7.1 shows the approximate speed of accessing the memory hierarchy on a 500-MHz DEC Alpha.

Table 7.1   Memory Access Speed on a DEC Alpha
  Register      2 ns
  L1 on-chip    4 ns
  L2 on-chip    5 ns
  L3 on-chip   30 ns
  Memory      220 ns

When every reference can be found in a cache, you say that you have a 100% hit rate. Generally, a hit rate of 90% or better is considered good for a level-one (L1) cache. In a level-two (L2) cache, a hit rate of above 50% is considered acceptable. Below that, application performance can drop off steeply. One can characterize the average read performance of the memory hierarchy by examining the probability that a particular load will be satisfied at a particular level of the hierarchy. For example, assume a memory architecture with an L1 cache speed of 10 ns, L2 speed of 30 ns, and memory speed of 300 ns. If a memory reference were satisfied from L1 cache 75% of the time, L2 cache 20% of the time, and main memory 5% of the time, the average memory performance would be:

      (0.75 * 10) + (0.20 * 30) + (0.05 * 300) = 28.5 ns

You can easily see why it's important to have an L1 cache hit rate of 90% or higher. Given that a cache holds only a subset of the main memory at any time, it's important to keep an index of which areas of the main memory are currently stored in the cache. To reduce the amount of space that must be dedicated to tracking which memory areas are in cache, the cache is divided into a number of equal-sized slots known as lines. Each line contains some number of sequential main memory locations, generally four to sixteen integers or real numbers. Whereas the data within a line comes from the


same part of memory, other lines can contain data that is far separated within your program, or perhaps data from somebody else's program, as in Figure 7.2 (Cache lines can come from different parts of memory). When you ask for something from memory, the computer checks to see if the data is available within one of these cache lines. If it is, the data is returned with a minimal delay. If it's not, your program may be delayed while a new line is fetched from main memory. Of course, if a new line is brought in, another has to be thrown out. If you're lucky, it won't be the one containing the data you are just about to need.
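As a quick check of the weighted-average estimate above, the following short C program (not part of the original text; it simply reuses the hit rates and latencies quoted in the example) reproduces the 28.5 ns figure:

      #include <stdio.h>

      int main(void)
      {
          double level_time[] = { 10.0, 30.0, 300.0 };  /* ns: L1, L2, memory      */
          double hit_rate[]   = { 0.75, 0.20, 0.05 };   /* fraction of references  */
          double avg = 0.0;

          for (int i = 0; i < 3; i++)
              avg += hit_rate[i] * level_time[i];

          printf("average access time = %.1f ns\n", avg);   /* prints 28.5 */
          return 0;
      }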

Figure 7.2   Cache Lines can come from Different Parts of Memory

On multiprocessors (several CPUs), written data must be returned to main memory so the rest of the processors can see it, or all other processors must be made aware of local cache activity. The problem can become very complex in a multiprocessor system. Caches are effective because programs often exhibit characteristics that help keep the hit rate high. One such characteristic is accessing memory sequentially: the address of each successive data element is incremented by one, and all of the data retrieved into the cache is used. This is called unit stride. The following loop is a unit-stride loop:

      DO I = 1, 1000000
        SUM = SUM + A(I)
      END DO

When a program accesses a large data structure using non-unit stride, performance suffers because data is loaded into cache that is not used. For example:

      DO I = 1, 1000000, 8
        SUM = SUM + A(I)
      END DO

This code would experience the same number of cache misses as the previous loop, and the same amount of data would be loaded into the cache. However, the program needs only one of the eight 32-bit words loaded into cache. Even though this program performs one-eighth the additions of the previous loop, its elapsed time is roughly the same as the previous loop because the memory operations dominate performance. While this example may seem a bit contrived, there are several situations in which non-unit strides occur quite often. First, when a FORTRAN two-dimensional array is stored in memory, successive elements in the first column are stored sequentially, followed by the


elements of the second column. If the array is processed with the row iteration as the inner loop, it produces a unit-stride reference pattern as follows:

      REAL*4 A(200, 200)
      DO J = 1, 200
        DO I = 1, 200
          SUM = SUM + A(I,J)
        END DO
      END DO

Interestingly, a FORTRAN programmer would most likely write the loop (in alphabetical order) as follows, producing a non-unit stride of 800 bytes between successive load operations:

      REAL*4 A(200, 200)
      DO I = 1, 200
        DO J = 1, 200
          SUM = SUM + A(I,J)
        END DO
      END DO

Because of this, some compilers can detect this suboptimal loop order and reverse the order of the loops to make best use of the memory system. As we will see, however, this code transformation may produce different results, and so you may have to give the compiler permission to interchange these loops in this particular example (or, after reading this book, you could just code it properly in the first place). A second common access pattern is traversing a linked list:

      while (ptr != NULL)
        ptr = ptr->next;

The next element that is retrieved is based on the contents of the current element. This type of loop bounces all around memory in no particular pattern. This is called pointer chasing, and there are no good ways to improve the performance of this code. A third pattern often found in certain types of codes is called gather (or scatter) and occurs in loops such as:

      SUM = SUM + ARR(IND(I))

where the IND array contains offsets into the ARR array. Again, like the linked list, the exact pattern of memory references is known only at runtime, when the values stored in the IND array are known. Some special-purpose systems have special hardware support to accelerate this particular operation.

7.4.1 Cache Organization
The process of pairing memory locations with cache lines is called mapping. Of course, given that a cache is smaller than main memory, you have to share the same cache lines for different memory locations. In caches, each cache line has a record of the memory address (called the tag) it represents and perhaps when it was last used. The tag is used to track which area of memory is stored in a particular cache line. The way memory locations (tags) are mapped to cache lines can have a beneficial effect on the way your program runs, because if two heavily used memory locations map onto the same cache line, the miss rate will be higher than you would like it to be. Caches can be organized in one of several ways: direct mapped, fully associative, and set associative.


7.4.1.1 Direct-Mapped Cache
Direct mapping, as presented in Figure 7.3, is the simplest algorithm for deciding how memory maps onto the cache. Say, for example, that your computer has a 4-KB cache. In a direct-mapped scheme, memory location 0 maps into cache location 0, as do memory locations 4K, 8K, 12K, etc. In other words, memory maps onto the cache modulo the cache size. Another way to think about it is to imagine a metal spring with a chalk line marked down the side. Every time around the spring, you encounter the chalk line at the same place modulo the circumference of the spring. If the spring is very long, the chalk line crosses many coils, the analog being a large memory with many locations mapping into the same cache line. Problems occur when alternating runtime memory references in a direct-mapped cache point to the same cache line. Each reference causes a cache miss and replaces the entry just brought in, causing a lot of overhead. The popular word for this is thrashing. When there is a lot of thrashing, a cache can be more of a liability than an asset, because each cache miss requires that a cache line be refilled; an operation that moves more data than merely satisfying the reference directly from main memory. It is easy to construct a pathological case that causes thrashing in a 4-KB direct-mapped cache:

      REAL*4 A(1024), B(1024)
      COMMON /STUFF/ A, B
      DO I = 1, 1024
        A(I) = A(I) * B(I)
      END DO

The arrays A and B both take up exactly 4 KB of storage, and their inclusion together in COMMON assures that the arrays start exactly 4 KB apart in memory. In a 4-KB direct-mapped cache, the same line that is used for A(1) is used for B(1), and likewise for A(2) and B(2), etc., so alternating references cause repeated cache misses. To fix it, you could either adjust the size of the array A, or put some other variables into COMMON between them. For this reason, one should generally avoid array dimensions that are close to powers of two.
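The same padding idea can be sketched in C; the struct below (illustrative only, not from the original text) plays the role of the COMMON block, with a pad inserted so that A(I) and B(I) no longer collide in a 4-KB direct-mapped cache:

      #include <stdio.h>

      #define N   1024          /* 1024 4-byte floats = 4 KB per array       */
      #define PAD 32            /* 128 bytes of padding = a few cache lines  */

      /* Analogous to COMMON /STUFF/ A, PAD, B: the pad shifts B so that
         A[i] and B[i] map to different lines of a 4-KB direct-mapped cache. */
      static struct {
          float A[N];
          float pad[PAD];
          float B[N];
      } stuff;

      int main(void)
      {
          for (int i = 0; i < N; i++)
              stuff.A[i] = stuff.A[i] * stuff.B[i];

          printf("%f\n", stuff.A[0]);  /* keep the loop from being optimized away */
          return 0;
      }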

Figure 7.3   Many memory addresses map to the same cache line


7.4.1.2 Fully Associative Cache
At the other extreme from a direct-mapped cache is a fully associative cache, where any memory location can be mapped into any cache line, regardless of memory address. Fully associative caches get their name from the type of memory used to construct them: associative memory. Associative memory is like regular memory, except that each memory cell knows something about the data it contains. When the processor goes looking for a piece of data, the cache lines are asked all at once whether any of them has it. The cache line containing the data holds up its hand and says "I have it"; if none of them do, there is a cache miss. It then becomes a question of which cache line will be replaced with the new data. Rather than map memory locations to cache lines via an algorithm, like a direct-mapped cache, the memory system can ask the fully associative cache lines to choose among themselves which memory locations they will represent. Usually the least recently used line is the one that gets overwritten with new data. The assumption is that if the data hasn't been used in quite a while, it is least likely to be used in the future. Fully associative caches have superior utilization when compared to direct-mapped caches, and it's difficult to find real-world examples of programs that will cause thrashing in a fully associative cache. The expense of fully associative caches is very high, however, in terms of size, price, and speed, so the associative caches that do exist tend to be small.

7.4.1.3 Set-Associative Cache
Now imagine that you have two direct-mapped caches sitting side by side in a single cache unit, as shown in Figure 7.4. Each memory location corresponds to a particular cache line in each of the two direct-mapped caches. The one you choose to replace during a cache miss is subject to a decision about whose line was used last, the same way the decision was made in a fully associative cache, except that now there are only two choices. This is called a set-associative cache. Set-associative caches generally come in two and four separate banks of cache. These are called two-way and four-way set-associative caches, respectively. Of course, there are benefits and drawbacks to each type of cache. A set-associative cache is more immune to cache thrashing than a direct-mapped cache of the same size, because for each mapping of a memory address into a cache line, there are two or more choices where it can go. The beauty of a direct-mapped cache, however, is that it's easy to implement and, if made large enough, will perform roughly as well as a set-associative design. Your machine may contain multiple caches for several different purposes. Here's a little program for causing thrashing in a 4-KB two-way set-associative cache:

      REAL*4 A(1024), B(1024), C(1024)
      COMMON /STUFF/ A, B, C
      DO I = 1, 1024
        A(I) = A(I) * B(I) + C(I)
      END DO

Like the previous cache thrasher program, this forces repeated accesses to the same cache lines, except that now there are three variables contending for the same mapping instead of two. Again, the way to fix it would be to change the size of the arrays or insert something in between them in COMMON. By the way, if you accidentally arranged a program to thrash like this, it would be hard for you to detect it, aside from a feeling that the program runs a little slow. Few vendors provide tools for measuring cache misses.


Figure 7.4   Two-Way Set-Associative Cache

7.4.1.4 Instruction Cache
So far we have glossed over the two kinds of information you would expect to find in a cache between main memory and the CPU: instructions and data. But if you think about it, the demand for data is separate from the demand for instructions. In superscalar processors, for example, it's possible to execute an instruction that causes a data cache miss alongside other instructions that require no data from cache at all, i.e., they operate on registers. It doesn't seem fair that a cache miss on a data reference in one instruction should keep you from fetching other instructions because the cache is tied up. Furthermore, a cache depends on locality of reference between bits of data and other bits of data, or instructions and other instructions, but what kind of interplay is there between instructions and data? It would seem possible for instructions to bump perfectly useful data from cache, or vice versa, with complete disregard for locality of reference. Many designs from the 1980s used a single cache for both instructions and data. But newer designs employ what is known as the Harvard Memory Architecture, where the demand for data is segregated from the demand for instructions. Main memory is still a single large pool, but these processors have separate data and instruction caches, possibly of different designs. By providing two independent sources for data and instructions, the aggregate rate of information coming from memory is increased, and interference between the two types of memory references is minimized. Also, instructions generally have an extremely high level of locality of reference because of the sequential nature of most programs. Because the instruction caches don't have to be particularly large to be effective, a typical architecture is to have separate L1 caches for instructions and data and to have a combined L2 cache. For example, the IBM/Motorola PowerPC 604e has separate 32-K four-way set-associative L1 caches for instruction and data and a combined L2 cache.

7.5 Timing a Program
Under UNIX, you can time program execution by placing the time command before everything else you normally type on the command line. When the program finishes, a timing summary is produced. For instance, if your program is called foo, you can time its execution by typing time foo. If you are


using the C shell or Korn shell, time is one of the shell's built-in commands. With a Bourne shell, time is a separate command executable in /bin. In any case, the following information appears at the end of the run:

• User time
• System time
• Elapsed time

These timing figures are easier to understand with a little background. As your program runs, it switches back and forth between two fundamentally different modes: user mode and kernel mode. The normal operating state is user mode. It is in user mode that the instructions the compiler generated on your behalf get executed, in addition to any subroutine library calls linked with your program. It might be enough to run in user mode forever, except that programs generally need other services, such as I/O, and these require the intervention of the operating system (the kernel). A kernel service request made by your program, or perhaps an event from outside your program, causes a switch from user mode into kernel mode. Time spent executing in the two modes is accounted for separately. The user time figure describes time spent in user mode. Similarly, system time is a measure of the time spent in kernel mode. As far as user time goes, each program on the machine is accounted for separately; that is, you won't be charged for activity in somebody else's application. System time accounting works the same way, for the most part; however, you can, in some instances, be charged for some system services performed on other people's behalf, in addition to your own. Incorrect charging occurs because your program may be executing at the moment some outside activity causes an interrupt. This seems unfair, but take consolation in the fact that it works both ways: other users may be charged for your system activity too, for the same reason. Taken together, user time and system time are called CPU time. Generally, the user time is far greater than the system time. You would expect this because most applications only occasionally ask for system services. In fact, a disproportionately large system time probably indicates some trouble. For instance, programs that are repeatedly generating exception conditions, such as page faults, misaligned memory references, or floating-point exceptions, use an inordinate amount of system time. Time spent doing things like seeking on a disk, rewinding a tape, or waiting for characters at the terminal doesn't show up in CPU time; these activities don't require the CPU, which is free to go on and execute other programs. The third piece of information, elapsed time, is a measure of the actual (wall clock) time that has passed since the program was started. For programs that spend most of their time computing, the elapsed time should be close to the CPU time. Reasons why elapsed time might be greater are:

• You are timesharing the machine with other active programs199.
• Your application performs a lot of I/O.
• Your application requires more memory bandwidth than is available on the machine.
• Your program was paging or swapped.

People often record the CPU time and use it as an estimate for elapsed time. Using CPU time is okay on a single-CPU machine, provided you have seen the program run when the machine was quiet and noticed that the two numbers were very close together. But for multiprocessors, the total CPU time can be far different from the elapsed time. Whenever there is a doubt, wait until you have the machine to

199 The uptime command gives you a rough indication of the other activity on your machine. The last three fields tell the average number of processes ready to run during the last 1, 5, and 15 minutes, respectively.


yourself, and time your program then, using elapsed time. It is very important to produce timing results that can be verified using another run when the results are being used to make important purchasing decisions. If you are running on a Berkeley UNIX derivative, the C shell's built-in time command can report a number of other useful statistics. Check your csh manual page for more possibilities. In addition to figures for CPU and elapsed time, the csh time command produces information about CPU utilization, page faults, swaps, blocked I/O operations (usually disk activity), and some measures of how much physical memory our program occupied when it ran. We describe each of them in turn.

7.5.1 Timing a Portion of the Program
For some benchmarking or tuning efforts, measurements taken on the outside of the program tell you everything you need to know. But if you are trying to isolate performance figures for individual loops or portions of the code, you may want to include timing routines on the inside too. The basic technique is simple enough (a short sketch follows the list):

1. Record the time before you start doing X.
2. Do X.
3. Record the time at completion of X.
4. Subtract the start time from the completion time.
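A minimal sketch of these four steps (not from the original text; the workload is a stand-in loop and the timer simply wraps gettimeofday) is:

      #include <stdio.h>
      #include <sys/time.h>

      /* Wall-clock time in seconds, similar in spirit to the hpcwall
         routine shown later in this section.                         */
      static double seconds(void)
      {
          struct timeval tp;
          gettimeofday(&tp, (struct timezone *) 0);
          return tp.tv_sec + tp.tv_usec * 1.0e-6;
      }

      int main(void)
      {
          double start, finish, sum = 0.0;

          start = seconds();                  /* 1. record the time before X   */
          for (int i = 0; i < 10000000; i++)  /* 2. do X (a stand-in workload) */
              sum += (double) i;
          finish = seconds();                 /* 3. record the time after X    */

          /* 4. subtract the start time from the completion time */
          printf("sum = %g, elapsed = %g seconds\n", sum, finish - start);
          return 0;
      }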

If, for instance, X's primary job is to calculate particle positions, divide by the total time to obtain a number for particle positions per second. You have to be careful, though; too many calls to the timing routines, and the observer becomes part of the experiment. The timing routines take time too, and their very presence can increase instruction cache misses or paging. Furthermore, you want X to take a significant amount of time so that the measurements are meaningful. Paying attention to the time between timer calls is really important because the clock used by the timing functions has a limited resolution. An event that occurs within a fraction of a second is hard to measure with any accuracy.

7.5.2 Getting Time Information
In this section, we discuss methods for getting various timer values during the execution of your program. For FORTRAN programs, a library timing function found on many machines is called etime, which takes a two-element REAL*4 array as an argument and fills the slots with the user CPU time and system CPU time, respectively. The value returned by the function is the sum of the two. Here's how etime is often used:

      real*4 tarray(2), etime
      real*4 start, finish
      start = etime(tarray)
      finish = etime(tarray)
      write (*,*) 'CPU time: ', finish - start

Not every vendor supplies an etime function; in fact, one doesn't provide a timing routine for FORTRAN at all. Try it first. If it shows up as an undefined symbol when the program is linked, you can use the following C routine. It provides the same functionality as etime:

      #include <sys/times.h>
      #define TICKS 100.

      float etime (parts)
      struct {
          float user;


          float system;
      } *parts;
      {
          struct tms local;
          times (&local);
          parts->user   = (float) local.tms_utime/TICKS;
          parts->system = (float) local.tms_stime/TICKS;
          return (parts->user + parts->system);
      }

There are a couple of things you might have to tweak to make it work. First of all, linking C routines with FORTRAN routines on your computer may require you to add an underscore (_) after the function name. This changes the entry to float etime_ (parts). Furthermore, you might have to adjust the TICKS parameter. We assumed that the system clock had a resolution of 1/100 of a second (true for the Hewlett-Packard machines that this version of etime was written for); 1/60 is very common, and on an RS/6000 the number would be 1000. You may find the value in a file named /usr/include/sys/param.h on your machine, or you can determine it empirically. A C routine for retrieving the wall time, using a call to gettimeofday, is shown below:

      #include <stdio.h>
      #include <stdlib.h>
      #include <sys/time.h>

      void hpcwall(double *retval)
      {
          static long zsec = 0;
          static long zusec = 0;
          double esec;
          struct timeval tp;
          struct timezone tzp;

          gettimeofday(&tp, &tzp);
          if ( zsec  == 0 ) zsec  = tp.tv_sec;
          if ( zusec == 0 ) zusec = tp.tv_usec;
          *retval = (tp.tv_sec - zsec) + (tp.tv_usec - zusec) * 0.000001;
      }

      void hpcwall_(double *retval) { hpcwall(retval); }  /* Other convention */

Given that you will often need both CPU and wall time, and you will be continually computing the difference between successive calls to these routines, you may want to write a routine to return the elapsed wall and CPU time upon each call, as follows:
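One possible form (illustrative only; it simply combines a times-based CPU timer with the gettimeofday-based wall-clock timer shown above, and the name hpctim is made up) is:

      #include <sys/time.h>
      #include <sys/times.h>
      #include <unistd.h>

      /* Returns, on each call, the wall-clock and CPU time (user + system)
         elapsed since the first call.                                      */
      void hpctim(double *wall, double *cpu)
      {
          static double wall0 = -1.0, cpu0 = -1.0;
          struct timeval tp;
          struct tms tbuf;
          double w, c, ticks = (double) sysconf(_SC_CLK_TCK);

          gettimeofday(&tp, (struct timezone *) 0);
          times(&tbuf);
          w = tp.tv_sec + tp.tv_usec * 1.0e-6;
          c = (tbuf.tms_utime + tbuf.tms_stime) / ticks;

          if (wall0 < 0.0) { wall0 = w; cpu0 = c; }
          *wall = w - wall0;
          *cpu  = c - cpu0;
      }

Calling such a routine once at the start of a section and once at the end, and differencing the results, gives both elapsed wall-clock and CPU time for that section.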

7.6 Subroutine Profiling
Sometimes you want more detail than the overall timing of the application, but you don't have time to modify the code to insert several hundred etime calls. Profiles are also very useful when you have been handed a strange 20,000-line application program and told to figure out how it works and then improve its performance. Most compilers provide a facility to automatically insert timing calls into your code at the entry and exit of each routine at compile time. While your program runs, the entry and exit times are recorded and then dumped into a file. A separate utility summarizes the execution patterns and produces a report that shows the percentage of the time spent in each of your routines and the library routines. The profile gives you a sense of the shape of the execution


profile. That is, you can see that 10% of the time is spent in subroutine A, 5% in subroutine B, etc. Naturally, if you add all of the routines together they should account for 100% of the overall time spent. From these percentages you can construct a picture, a profile, of how execution is distributed when the program runs. Though not representative of any particular profiling tool, the histograms in Figure 7.5-left (sharp profile) and Figure 7.5-right (flat profile) depict these percentages, sorted from left to right, with each vertical column representing a different routine. They help illustrate different profile shapes. A sharp profile says that most of the time is spent in one or two procedures, and if you want to improve the program's performance you should focus your efforts on tuning those procedures. A minor optimization in a heavily executed line of code can sometimes have a great effect on the overall runtime, given the right opportunity. A flat profile, on the other hand, tells you that the runtime is spread across many routines, and effort spent optimizing any one or two will have little benefit in speeding up the program. Of course, there are also programs whose execution profile falls somewhere in the middle.

Figure 7.5   Sharp Profiling (left) vs. Flat Profiling (right)

We cannot predict with absolute certainty what you are likely to find when you profile your programs, but there are some general trends. For instance, engineering and scientific codes built around matrix solutions often exhibit very sharp profiles. The runtime is dominated by the work performed in a handful of routines. To tune the code, you need to focus your efforts on those routines to make them more efficient. It may involve restructuring loops to expose parallelism, providing hints to the compiler, or rearranging memory references. In any case, the challenge is tangible; you can see the problems you have to fix. There are limits to how much tuning one or two routines will improve your runtime, of course. An often-quoted rule of thumb is Amdahl's Law, derived from remarks made in 1967 by one of the designers of the IBM 360 series, and founder of Amdahl Corporation, Gene Amdahl. Strictly speaking, his remarks were about the performance potential of parallel computers, but people have adapted Amdahl's Law to describe other things too. For our purposes, it goes like this: say you have a program with two parts, one that can be optimized so that it goes infinitely fast and another that can't be optimized at all. Even if the optimizable portion makes up 50% of the initial runtime, at best you will be able to cut the total runtime in half. That is, your runtime will eventually be dominated by the portion that can't be optimized. This puts an upper limit on your expectations when tuning. Even given the finite return on effort suggested by Amdahl's Law, tuning a program with a sharp profile can be rewarding. Programs with flat profiles are much more difficult to tune. These are often


system codes, nonnumeric applications, and varieties of numerical codes without matrix solutions. It takes a global tuning approach to reduce, to any justifiable degree, the runtime of a program with a flat profile. For instance, you can sometimes optimize instruction cache usage, which is complicated because of the program's equal distribution of activity among a large number of routines. It can also help to reduce subroutine call overhead by folding callees into callers. Occasionally, you can find a memory reference problem that is endemic to the whole program and one that can be fixed all at once. When you look at a profile, you might find an unusually large percentage of time spent in library routines such as log, exp, or sin. Often these functions are done in software routines rather than inline. You may be able to rewrite your code to eliminate some of these operations. Another important pattern to look for is when a routine takes far longer than you expect. Unexpected execution time may indicate you are accessing memory in a pattern that is bad for performance or that some aspect of the code cannot be optimized properly. In any case, to get a profile, you need a profiler. One or two subroutine profilers come standard with the software development environments on all UNIX machines. We cover two of them: prof and gprof. In addition, we mention a few line-by-line profilers. Subroutine profilers can give you a general overall view of where time is being spent. You probably should start with prof, if you have it (most machines do); otherwise, use gprof. After that, you can move to a line-by-line profiler if you need to know which statements take the most time.

7.7 Loop Optimizations
In nearly all high performance applications, loops are where the majority of the execution time is spent. Here we focus on techniques used to improve the performance of these "clutter-free" loops. Sometimes the compiler is clever enough to generate the faster versions of the loops, and other times we have to do some rewriting of the loops ourselves to help the compiler. It's important to remember that one compiler's performance-enhancing modifications are another compiler's clutter. When you make modifications in the name of performance, you must make sure you're helping by testing the performance with and without the modifications. Also, when you move to another architecture you need to make sure that any modifications aren't hindering performance. For this reason, you should choose your performance-related modifications wisely. You should also keep the original (simple) version of the code for testing on new architectures, and if the benefit of the modification is small, you should probably keep the code in its most simple and clear form. The different loop optimization techniques include:

➢ Loop unrolling (a brief sketch follows this list)
➢ Nested loop optimization
➢ Loop interchange
➢ Memory reference optimization
➢ Blocking
➢ Out-of-core solutions
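As a brief illustration of the first technique, the sketch below (not from the original text) unrolls a simple vector update four ways, with a cleanup loop for trip counts that are not a multiple of four:

      /* y(i) = y(i) + a*x(i), unrolled by a factor of four. */
      void axpy(int n, float a, const float *x, float *y)
      {
          int i;
          for (i = 0; i + 3 < n; i += 4) {   /* unrolled body        */
              y[i]     += a * x[i];
              y[i + 1] += a * x[i + 1];
              y[i + 2] += a * x[i + 2];
              y[i + 3] += a * x[i + 3];
          }
          for (; i < n; i++)                 /* remainder iterations */
              y[i] += a * x[i];
      }

The unrolled body exposes four independent multiply-add operations per iteration, giving the compiler and the processor more opportunity to overlap loads, stores, and floating-point work.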

Someday, it may be possible for a compiler to perform all these loop optimizations automatically. Typically, loop unrolling is performed as part of the normal compiler optimizations, while other optimizations may have to be triggered using explicit compile-time options. As you contemplate making manual changes, look carefully at which of these optimizations can be done by the compiler. Also run some tests to determine whether the compiler optimizations are as good as hand optimizations.

7.7.1 Operation Counting
Before you begin to rewrite a loop body or reorganize the order of the loops, you must have some idea of what the body of the loop does for each iteration. Operation counting is the process of


surveying a loop to understand the operation mix. You need to count the number of loads, stores, floating-point, integer, and library calls per iteration of the loop. From the count, you can see how well the operation mix of a given loop matches the capabilities of the processor. Of course, operation counting doesn't guarantee that the compiler will generate an efficient representation of a loop, but it generally provides enough insight into the loop to direct tuning efforts. Bear in mind that an instruction mix that is balanced for one machine may be imbalanced for another. Processors on the market today can generally issue some combination of one to four operations per clock cycle. Address arithmetic is often embedded in the instructions that reference memory. Because the compiler can replace complicated loop address calculations with simple expressions (provided the pattern of addresses is predictable), you can often ignore address arithmetic when counting operations. Let's look at a few loops and see what we can learn about the instruction mix:

      DO I = 1, N
        A(I,J,K) = A(I,J,K) + B(J,I,K)
      ENDDO

This loop contains one floating-point addition and three memory references (two loads and a store). There are some complicated array index expressions, but these will probably be simplified by the compiler and executed in the same cycle as the memory and floating-point operations. For each iteration of the loop, we must increment the index variable and test to determine if the loop has completed. A 3:1 ratio of memory references to floating-point operations suggests that we can hope for no more than 1/3 of peak floating-point performance from the loop unless we have more than one path to memory. That's bad news, but good information. The ratio tells us that we ought to consider memory reference optimizations first. The loop below contains one floating-point addition and two memory operations (a load and a store). Operand B(J) is loop-invariant, so its value only needs to be loaded once, upon entry to the loop:

      DO I = 1, N
        A(I) = A(I) + B(J)
      ENDDO

Again, our floating-point throughput is limited, though not as severely as in the previous loop; the ratio of memory references to floating-point operations is 2:1. The next example shows a loop with better prospects. It performs an element-wise multiplication of two vectors of complex numbers and assigns the results back to the first. There are six memory operations (four loads and two stores) and six floating-point operations (two additions and four multiplications), so the ratio of memory operations to floating-point operations is 1:1.

In some cases a super-linear speedup (a speedup greater than the number of processors) may be encountered. Usually this is caused by either using a suboptimal sequential


algorithm or some unique feature of the hardware architecture that favors the parallel computation. For example, one common reason for super-linear speedup is the extra memory available in the multiprocessor system. The speedup of any parallel computing environment obeys Amdahl's law, which states that if F is the fraction of a calculation that is sequential (i.e., cannot benefit from parallelization), and (1 − F) is the fraction that can be parallelized, then the maximum speedup that can be achieved by using N processors is

        η = 1 / (F + (1 − F)/N)                                        Eq. 8.2

In the limit, as N tends to infinity, the maximum speedup η tends to 1/F. In practice, the performance ratio falls off rapidly as N is increased, once (1 − F)/N becomes small compared to F. As an example, if F is only 10%, the problem can be sped up by at most a factor of 10, no matter how large a value of N is used. For this reason, parallel computing is only useful for either small numbers of processors or problems with very low values of F: so-called embarrassingly parallel problems. A great part of the craft of parallel programming consists of attempting to reduce F to the smallest possible value (Figure 8.4). As Amdahl's law indicates, the maximum speedup of any code is limited by the amount that can be effectively parallelized; in other words, you are limited by the mandatory serial portions of your code208.
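As a quick check of Eq. 8.2, the short C program below (not from the original text) evaluates the speedup bound for the 70%-parallelizable example discussed next:

      #include <stdio.h>

      /* Maximum speedup from Eq. 8.2 for serial fraction F on N processors. */
      static double amdahl(double F, int N)
      {
          return 1.0 / (F + (1.0 - F) / (double) N);
      }

      int main(void)
      {
          double F = 0.3;   /* 70% of the work can be parallelized */
          printf("N = 4: speedup = %.3f\n", amdahl(F, 4));   /* 2.105 */
          printf("N = 8: speedup = %.3f\n", amdahl(F, 8));   /* 2.581 */
          return 0;
      }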

Figure 8.4   Amdahl's Law

For example, suppose 70% of a program can be sped up if parallelized and run on multiple CPUs instead of one. If F is the fraction of the calculation that is sequential, and (1 − F) is the fraction that can be

208 Paul Edmon, "Introduction to Parallel Programming and MPI", FAS Research Computing, Harvard University.


parallelized, then the maximum speedup that can be achieved by using N processors is given by Amdahl's law. Substituting the values for this example (F = 0.3), with 4 processors we get a speedup of 2.105; doubling the processors to 8 gives 2.581. So in this case, doubling the processing power has only improved the speedup by roughly one-fifth. If the whole problem were parallelizable, we would, of course, expect the speedup to double as well. Therefore, throwing in more hardware is not necessarily the optimal approach.

8.4.1 Weak vs. Strong Scaling
In the context of high performance computing, there are two common notions of scalability:



• The first is strong scaling, which is defined as how the solution time varies with the number of processors for a fixed total problem size; that is, execution time decreases in inverse proportion to the number of processors. Figure 8.5 displays an example of strong scaling from the NASA application FUN3D (M6 wing results below)209.
• The second is weak scaling, which is defined as how the solution time varies with the number of processors for a fixed problem size per processor; that is, execution time remains constant as the problem size and the processor count are increased in proportion.
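Given a set of measured runtimes, both notions reduce to simple ratios. The sketch below (illustrative only, with made-up timings) computes strong-scaling speedup and parallel efficiency; for weak scaling the efficiency would instead be the single-processor time divided by the N-processor time at fixed work per processor:

      #include <stdio.h>

      int main(void)
      {
          int    nproc[]  = { 1, 2, 4, 8 };
          double time_s[] = { 100.0, 52.0, 27.0, 15.0 };  /* seconds (made up) */

          for (int i = 0; i < 4; i++) {
              double speedup    = time_s[0] / time_s[i];
              double efficiency = speedup / nproc[i];
              printf("N = %d  speedup = %5.2f  efficiency = %4.2f\n",
                     nproc[i], speedup, efficiency);
          }
          return 0;
      }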

Figure 8.5   Example of Strong Scalability


8.4.2 Scalability vs. Performance
First, it's critical for readers to understand the fundamental difference between scalability and performance. While the two are frequently conflated, they are quite different. Performance is the capability of a particular component to provide a certain amount of capacity. Scalability, in contrast, is about the ability of a system to expand to meet demand. This is quite frequently measured by looking at the aggregate performance of the individual components of a particular system and how they function over time. Put more simply, performance measures the capability of a single part of a large system, while scalability measures the ability of a large system to grow to meet growing demand. Scalable systems may have individual parts that are relatively low performing. I have heard that the

209 David E. Keyes, "Domain Decomposition Methods for Partial Differential Equations", Columbia University.


Amazon.com retail website's web servers went from 300 transactions per second (TPS) to a mere 3 TPS each after moving to a more scalable architecture. The upside is that while every web server might have lower individual performance, the overall system became significantly more scalable, and new web servers could be added ad infinitum. Most x86 clusters today are built out for very high performance and scalability, but with a particular focus on the performance of individual components (servers) and the interconnect network210.

8.4.3 Load Balancing
A load balancer is a device that acts as a reverse proxy and distributes network or application traffic across a number of servers. Load balancers are used to increase the capacity (concurrent users) and reliability of applications. They improve the overall performance of applications by decreasing the burden on servers associated with managing and maintaining application and network sessions, as well as by performing application-specific tasks. Load balancers are generally grouped into two categories: Layer 4 and Layer 7. Layer 4 load balancers act upon data found in network and transport layer protocols (IP, TCP, FTP, UDP). Layer 7 load balancers distribute requests based upon data found in application layer protocols such as HTTP. Requests are received by both types of load balancers and are distributed to a particular server based on a configured algorithm. Some industry standard algorithms are:

• Round robin (a minimal sketch follows this list)
• Weighted round robin
• Least connections
• Least response time
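A round-robin scheduler is trivial to sketch; the toy C function below (not from any real load-balancer implementation) simply cycles through the servers in turn, ignoring health checks and weighting:

      #include <stdio.h>

      /* Return the index of the next server in rotation. */
      static int next_server(int nservers)
      {
          static int last = -1;
          last = (last + 1) % nservers;
          return last;
      }

      int main(void)
      {
          for (int i = 0; i < 6; i++)
              printf("request %d -> server %d\n", i, next_server(3));
          return 0;
      }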

Load balancers ensure reliability and availability by monitoring the "health" of applications and only sending requests to servers and applications that can respond in a timely manner.

8.5 Performance of CFD Codes

The method used to assess the performance of a parallel CFD solver is becoming a topic for debate. While some implementations use a fixed number of outer iterations to assess the performance of the parallel solver, regardless of whether a solution has been obtained or not, other implementers use a fixed value of the residual as the basis for evaluation. Ironically, a large number of implementers do not mention the method used in their assessment at all! The reason for this discrepancy is that the first group (who use a fixed number of outer iterations) believes that the evaluation of parallel performance should be done using exactly the same algorithm, which justifies the use of a fixed number of outer iterations. This can be acceptable from an algorithmic point of view. The other group (who use a fixed value for the maximum residual) believes that the evaluation of parallel performance should be done using the converged solution of the problem, which justifies the use of the maximum residual as a criterion for performance measurement. This is acceptable from an engineering point of view and from the user's point of view; in all cases, the parallel code will be used to seek a valid solution! And if the number of outer iterations turns out to be the same as that of the sequential version, so much the better. The problem becomes more complicated when an algebraic multigrid (AMG) solver is used. Depending on the method used in implementing the AMG solver, the maximum number of AMG levels in the parallel version will usually be less than that of the sequential version, which raises the issue that one is not comparing the same algorithm. From an engineering point of view, the main concern is to obtain a valid solution for a given problem in a reasonable amount of time and thus, a


210 Randy Bias, "Grid, Cloud, HPC ... What's the Diff?", posted on the Cloudscaling blog, 2010.


user will not actually perform a sequential run and then a parallel run; rather, she will require the code to use as many AMG levels as possible.

8.5.1 CFD for Next Generation High Performance Computing
High Performance Computing (HPC) is moving towards large-scale parallelism. The Jaguar supercomputer, at the time of writing the fastest computer in the world, has over 200,000 processing cores. On-chip parallelism has been increasing in regular processors (dual core, quad core, etc.) since 2001, but now larger scales of parallelism are being seen on a single chip. The introduction of Graphics Processing Units (GPUs), which have hundreds of cores on a single chip, into HPC represents a large change in the architectures being used for scientific computing. The scale of parallelism and the new architectures require novel numerical solvers to be written and optimized for solving CFD problems.

8.5.2 Hardware Considerations and CPU vs. GPU Technology
GPUs have traditionally been used for rendering graphics, in which several relatively simple operations are performed identically on all parts of the input to produce an output image211. The nature of this work makes graphics rendering a good candidate for parallelization, and GPUs reflect this in their architecture by having many cores. The differences between CPU and GPU architecture can be summarized in four points, visualized in Figure 8.6:

Figure 8.6   Architecture Differences between CPU and GPU



• CPUs contain few cores but have a relatively large cache (several MB).
• CPUs have many advanced features such as predictive branching, out-of-order execution and deep pipelines to improve throughput.
• GPUs have hundreds of cores split into groups which share control hardware and high-speed memory (equivalent to cache). This high-speed memory is very small (a few KB).
• GPU cores are 'lightweight', i.e. they lack the advanced features of CPU cores.

211 Mark Mawson, Alistair Revell & Robert Prosser, "Computational Fluid Dynamics Codes For Next Generation High Performance Computing", Computational Fluid Dynamics Group, University of Manchester.


8.5.2.1 Case Study 1 – 2D Laplace Equation
A V-Cycle multigrid method was written for GPUs to solve a 2D Laplace problem (Δu = 0). The multigrid method solves a system of linear equations by restricting the error of the initial fine-grid solution to increasingly coarse grids and performing smoothing operations on them. It has been shown that higher-frequency errors are more susceptible to smoothing operations than low-frequency errors. By restricting the solution to coarser grids, the relative frequency of the errors increases, making them susceptible to smoothing. Each level of coarseness allows a low-frequency error component to be smoothed; the solution at each grid level is then summed with the level above and further smoothing is carried out to remove any errors introduced by the summation. The result is a solution that is smoothed across all frequencies of error (see Figure 8.7, Results for V-Cycle Multigrid).

8.5.2.2 Results
GPU and CPU implementations of the V-Cycle multigrid method were tested on grids of size up to 4097 × 4097 elements. The maximum grid size was limited by the size of the RAM on the GPU (4 GB); future work will include allowing partitions of larger grids to be moved to and from the GPU. The GPU implementation performed up to 12× faster than the CPU version.
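For reference, one smoothing sweep of the kind applied at each grid level can be sketched in serial C as a weighted-Jacobi update of the interior points (array layout and relaxation weight are illustrative, and the sketch is serial rather than the GPU version used in the study):

      /* One weighted-Jacobi sweep for the 2D Laplace problem on an n-by-n
         grid with fixed boundary values; u and unew are row-major arrays
         of length n*n.                                                    */
      void jacobi_smooth(int n, const double *u, double *unew, double omega)
      {
          for (int i = 1; i < n - 1; i++) {
              for (int j = 1; j < n - 1; j++) {
                  double avg = 0.25 * (u[(i - 1) * n + j] + u[(i + 1) * n + j] +
                                       u[i * n + (j - 1)] + u[i * n + (j + 1)]);
                  unew[i * n + j] = (1.0 - omega) * u[i * n + j] + omega * avg;
              }
          }
      }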

8.5.2.3 Future Work – Heterogeneous Computing
In the V-cycle method shown, the CPU is idle while functions run on the GPU. The principle of heterogeneous computing is that the CPU performs other tasks while the GPU is being used. For CFD applications this could include using the CPU to assist with the solver or, for time-dependent problems, post-processing the previous time step while the GPU calculates the latest time step (see Figure 8.8).

Figure 8.8   Heterogeneous Computing using CPUs and GPUs


8.5.3 Case Study 2 – Unstructured Grid Based CFD Solvers on Modern Graphics Hardware
The 3D Euler equations for inviscid, compressible flow are considered by [Corrigan et al.]212. Effective memory bandwidth is improved by reducing total global memory access and overlapping redundant computation, as well as by using an appropriate numbering scheme and data layout. The applicability of per-block shared memory is also considered. The performance of the solver is demonstrated on two benchmark cases: a missile and the NACA 0012 wing. For a variety of mesh sizes, an average speed-up factor of roughly 9.5X is observed over the equivalent parallelized OpenMP code running on a quad-core CPU, and roughly 33X over the equivalent code running in serial.

8.5.3.1 Background and Literature Survey
Recently, GPUs (Graphics Processing Units) have seen a tremendous increase in performance. In addition to this high computational performance, the latest modern graphics hardware offers increasing memory capacity, as well as support for 64-bit floating point arithmetic. As parallel, multi-core processors, GPUs offer tremendous potential for applications in computational fluid dynamics. In order to fully exploit the computational power of such hardware, considerable care is required in the coding and implementation, particularly in the memory access pattern. GPUs have general-purpose global memory, which is not automatically cached and exhibits high latency in comparison with the instruction throughput of GPUs. Furthermore, with earlier CUDA-enabled GPUs, there were stringent requirements for achieving optimal effective memory bandwidth, with a large loss of performance when these requirements went unmet. With the data-dependent memory access of unstructured grid based solvers, this loss of performance is almost assured. However, with due care, structured grid based solvers can meet these requirements due to the regular memory access patterns of such solvers, as described in the work of (Brandvik & Pullan) and (Tolke). Further work on regular grid solvers includes that of (Phillips et al.), who have developed a 2D compressible Euler solver on a cluster of GPUs, and [Thibault et al.]213, who have implemented a 3D incompressible Navier-Stokes solver for multi-GPU systems. So far, the implementation of optimized unstructured grid based solvers for modern graphics hardware has been relatively rare, perhaps due to these stringent requirements. In fact, just prior to its first release, [Owens et al.]214 comprehensively surveyed the field of general-purpose computation on graphics hardware (GPGPU), which included a number of primarily structured grid based solvers, such as those of [Harris]215, [Scheidegger et al.]216, and [Hagen et al.]217. However, the architecture has changed substantially, and many of the limitations of GPGPU via traditional graphics APIs such as OpenGL are no longer an issue. The most recent CUDA-enabled GPUs have looser requirements for achieving high effective memory bandwidth. Roughly speaking, memory no longer needs to be accessed in a specific order by consecutive threads. Rather, high effective memory bandwidth can be achieved as long as consecutive threads access nearby locations in memory, which is called coalescing. Thus, if an

212 Andrew Corrigan, Fernando Camelli, Rainald Lohner, and John Wallin, "Running Unstructured Grid Based CFD Solvers on Modern Graphics Hardware", 19th AIAA Computational Fluid Dynamics, 2009.
213 Thibault, J.
and Senocak, I., “CUDA Implementation of a Navier-Stokes Solver on Multi-GPU Desktop Platforms for Incompressible Flows," 47th AIAA Aerospace Sciences Meeting Including The New Horizons Forum and Aerospace Exposition, No. AIAA 2009-758, January 2009. 214 Owens, J. D., Luebke, D., Govindaraju, N., Harris, M., Krger, J., Lefohn, A. E., and Purcell, T. J., “A Survey of General-Purpose Computation on Graphics Hardware," Computer Graphics Forum, Vol. 26, No. 1, 2007. 215 Harris, M., “Fast Fluid Dynamics Simulation on the GPU," GPU Gems, chap. 38, Addison-Wesley, 2004. 216 C. Scheidegger, J. Comba, R. C., “Practical CFD simulations on the GPU using SMAC." Computer Graphics Forum, Vol. 24, 2005. 217 12Hagen, T., Lie, K.-A., and Natvig, J., “Solving the Euler Equations on Graphics Processing Units," Proceedings of the 6th International Conference on Computational Science, Vol. 3994 of Lecture Notes in Computer Science, Springer, May 2006. 212


appropriate memory access pattern is obtained, one can expect that modern GPUs will be capable of achieving high effective memory bandwidth and, in general, high performance for unstructured grid based CFD solvers. The purpose of this work is to study techniques which achieve this.
8.5.4.2 Implementation on Graphics Hardware
The performance-critical portion of the solver consists of a loop which repeatedly computes the time derivatives of the conserved variables [see Corrigan et al.]218. The conserved variables are then updated using an explicit Runge-Kutta time-stepping scheme. The most expensive computation consists of accumulating flux contributions and artificial viscosity across each face when computing the time derivatives. Therefore, the performance of the CUDA kernel which implements this computation is crucial in determining whether or not high performance is achieved, and it is the focus of this section.
8.5.4.3 Test Cases
The performance of the GPU code was measured on a prototype NVIDIA Tesla GPU, supporting compute capability 1.3, with 24 multiprocessors. A NACA 0012 wing in supersonic (M = 1.2; α = 0) flow was used as a test case, for which the pressure contours are plotted in Figure 8.9-(Left). Timing measurements in single precision were taken for a variety of meshes, showing an average performance scaling factor of 9.4X in comparison to the OpenMP code running on four cores and 32.6X in comparison to the OpenMP code on one core. Furthermore, the code running on graphics hardware is faster by a factor of 3.9X using redundant computation in comparison to pre-computed flux contributions. Timing measurements in double precision are given in Figure 8.10-(Top) for a variety of meshes, showing an average performance scaling factor of 1.56X in comparison to the OpenMP code running on four cores and 4.7X in comparison to the OpenMP code on one core. Furthermore, the code running on graphics hardware is faster by a factor of 1.1X using redundant computation in comparison to pre-computed flux contributions.
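To make the memory-layout discussion above concrete, the following is a minimal CUDA sketch; it is not the authors' kernel, and the array names and the simple explicit update are hypothetical. Storing the conserved variables as a structure of arrays lets consecutive threads read consecutive addresses, which is exactly the coalescing requirement described earlier; an array-of-structures layout would instead produce strided, uncoalesced accesses.

// Hypothetical structure-of-arrays update of cell-averaged conserved
// variables: thread i touches element i of each array, so a warp's loads
// and stores are coalesced into a few wide memory transactions.
// Example launch: update_cells<<<(n_cells + 255) / 256, 256>>>(...);
__global__ void update_cells(const double *rho,  const double *rhoE,
                             const double *drho, const double *drhoE,
                             double *rho_new, double *rhoE_new,
                             double dt, int n_cells)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n_cells) {
        rho_new[i]  = rho[i]  + dt * drho[i];   // unit-stride (coalesced)
        rhoE_new[i] = rhoE[i] + dt * drhoE[i];  // unit-stride (coalesced)
    }
}

The actual flux accumulation also gathers face contributions through connectivity arrays; the numbering scheme mentioned above is what keeps those indirect accesses reasonably local in memory.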

Figure 8.9 Pressures at the Surface and Plane for the NACA 0012 (Left) and at the Surface for the Missile (Right)

218 Andrew Corrigan, Fernando Camelli, Rainald Lohner, and John Wallin, “Running Unstructured Grid Based CFD Solvers on Modern Graphics Hardware”, 19th AIAA Computational Fluid Dynamics Conference, 2009.


Figure 8.10 Running Times in Double Precision Per Element Per Iteration for the NACA 0012 (Top) and Missile (Bottom)


A missile in supersonic (M = 1.2; α = 8 degrees) flow was used as an additional test case. The pressure contours are plotted in Figure 8.9-(Right). Timing measurements in double precision are given in Figure 8.10-(Bottom) for a variety of meshes, showing an average performance scaling factor of 2.5X in comparison to the OpenMP code running on four cores and 7.4X in comparison to the OpenMP code on one core. Furthermore, the code running on graphics hardware is faster by a factor of 1.63X using redundant computation in comparison to pre-computed flux contributions. For additional details, consult [Corrigan et al.]219.

8.6 Software Consideration and Message Passing Interface (MPI)

MPI (Message Passing Interface) is a message passing standard for homogeneous and heterogeneous parallel and distributed computing systems. The development of the MPI standard is a multinational effort which was initiated in 1992 and is supported by ARPA, NSF and the Commission of the European Community. A good introduction to MPI is provided by Foster220, and a brief description is presented in221. MPI is a library, not a language. It consists of subroutines that are called from FORTRAN, C, or C++ programs to facilitate parallelization of programs. An MPI program includes one or more processes which communicate with each other through calls to MPI library routines. There are two types of communications, namely, point-to-point communication between pairs of processes, and collective communication between groups of processes. Several variants of “receive” and “send” functions are provided to enable users to achieve peak performance. The basic syntax of a “hello world” program in both C and FORTRAN is provided below222.

#include <mpi.h>    /* Need to include this to be able to hook into the MPI API */
#include <stdio.h>
int main(int argc, char *argv[])
{
    int numprocs, rank;
    /* Initializes MPI */
    MPI_Init(&argc, &argv);
    /* Figures out the number of processors I am asking for */
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    /* Figures out which rank we are */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    printf("Process %d out of %d\n", rank, numprocs);
    /* Need this to shutdown MPI */
    MPI_Finalize();
}

PROGRAM hello
    !### Need to include this to be able to hook into the MPI API ###
    INCLUDE 'mpif.h'
    INTEGER*4 :: numprocs, rank, ierr
    !### Initializes MPI ###
    CALL MPI_INIT(ierr)
    !### Figures out the number of processors I am asking for ###
    CALL MPI_COMM_SIZE(MPI_COMM_WORLD, numprocs, ierr)
    !### Figures out which rank we are ###
    CALL MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr)
    write(*,*) 'Process', rank, 'out of', numprocs
    !### Need this to shutdown MPI ###
    CALL MPI_FINALIZE(ierr)
END PROGRAM hello
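Building on the hello-world skeleton above, the short sketch below illustrates the two communication types mentioned earlier: a point-to-point send/receive between a pair of ranks, followed by a collective reduction across all ranks. It is a generic illustration written for this text, not taken from the cited references; the values being exchanged are arbitrary.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Point-to-point: rank 0 sends one value to rank 1 (matching tag 0). */
    if (size > 1) {
        double x = 3.14;
        if (rank == 0)
            MPI_Send(&x, 1, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD);
        else if (rank == 1)
            MPI_Recv(&x, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }

    /* Collective: every rank contributes a partial value and every rank
       receives the global sum (e.g., a residual norm in a CFD solver). */
    double local = (double)rank, global = 0.0;
    MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    if (rank == 0)
        printf("Global sum over %d ranks = %g\n", size, global);

    /* Need this to shut down MPI */
    MPI_Finalize();
    return 0;
}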

219 Andrew Corrigan, Fernando Camelli, Rainald Lohner, and John Wallin, “Running Unstructured Grid Based CFD Solvers on Modern Graphics Hardware”, 19th AIAA Computational Fluid Dynamics Conference, 2009.
220 I. Foster, “Designing and Building Parallel Programs”, http://www.mcs.anl.gov/dbpp/.
221 L. Clarke, I. Glendinning, and R. Hempel, “The MPI Message Passing Interface Standard”, March 1994.
222 Paul Edmon, “Introduction to Parallel Programming and MPI”, FAS Research Computing, Harvard University.


8.7 Cloud Computing: Definition and Features223

From the technical definition (Mell et al. 2011), “cloud computing is a model for enabling ubiquitous, convenient on-demand network access to a shared pool of resources (e.g., network servers, storage, applications and services) that can be rapidly provisioned and released with minimal management effort or service provider interaction.” Clouds have three service models, as described below.
8.7.1 Infrastructure as a Service (IaaS)
Refers to on-demand provisioning of physical resources, usually in terms of Virtual Machines (VMs). Consumers can deploy and run arbitrary software, but they do not manage or control the underlying cloud infrastructure. Examples of IaaS providers include Amazon EC2, Windows Azure Virtual Machines, Google Compute Engine, GoGrid, and Flexiscale.
8.7.2 Platform as a Service (PaaS)
Refers to platform-layer resources, including operating system support, databases, web servers and software development frameworks. Users host an environment for their applications and can control them, but they cannot control the operating system, hardware or network they are using. PaaS provider examples include Google App Engine and Microsoft Windows Azure Compute.
8.7.3 Software as a Service (SaaS)
Refers to providing on-demand applications over the Internet: providers install and operate application software on a cloud infrastructure, and users access it through an interface via the Internet. SaaS pricing is typically monthly or yearly. Related examples include Google Apps, Microsoft Office 365 and Autodesk 360.
In addition, clouds have four deployment models. In a Public Cloud, the infrastructure is provisioned for open use by the general public. In a Private Cloud, the infrastructure is provisioned for exclusive use by a single organization. It can be managed by the organization, by third parties, or by some combination of them. A private cloud offers the highest degree of control over performance, reliability and security; however, it does not provide benefits such as the absence of up-front cost. In a Community Cloud, the infrastructure is shared by several organizations with the same concerns. A Hybrid Cloud is a combination of two or more clouds that remain unique entities but are bound together by standardized or proprietary technologies that enable data and application portability.
Cloud computing provisioning relies heavily on virtualization, which enables offering a homogeneous service simultaneously to all users. The main purposes of virtualization are abstraction and encapsulation. One important advantage of virtualization is that, on the same physical infrastructure, different runtime environments can exist without re-initialization of hardware; a runtime environment can be managed easily and started/stopped quickly.

8.8 High Performance Computing (HPC)
Scientists, engineers and analysts in virtually every field are turning to High Performance Computing (HPC) to solve today’s vital and complex problems224. Simulations are increasingly replacing expensive physical testing, as more complex environments can be modeled and, in some cases, fully simulated. High-performance computing encompasses advanced computation over parallel processing, enabling faster execution of highly compute-intensive tasks such as climate research, molecular modeling, physical simulations, cryptanalysis, geophysical modeling, automotive and aerospace design, financial modeling, data mining and more. Figure 8.11 shows the Maui High Performance Computing Center, with 1280 servers and a Mellanox InfiniBand interconnect delivering 42.3 TFLOPS

223 2014 ASHRAE/IBPSA-USA Building Simulation Conference, Atlanta, GA, September 10-12, 2014.
224 Mellanox Technologies Inc., 2006.


(Courtesy of Mellanox Technologies).

Figure 8.11 Maui High Performance Computing Center with 1280 Servers

HPC clusters have become the most common building blocks for high-performance computing, not only because they are affordable, but because they provide the needed flexibility and deliver superior price/performance compared to proprietary Symmetric Multi-Processing (SMP) systems, with the simplicity and value of industry standard computing. Real-world application performance depends on the performance of the cluster’s key elements: the processor, the memory, and the interconnect. The interconnect controls the data transfer between servers, and has a strong influence on CPU efficiency and memory utilization. Transport off-load interconnect architectures, unlike the “on-loading” ones, eliminate the need of dealing with protocol processing within the CPU and therefore increase the number of cycles available for computational tasks. If the CPU is busy moving data and handling network protocol processing, it is unable to perform computational work, and the overall productivity of the system is severely degraded. The memory copy overhead includes the resources required to copy data buffers from the network device to the kernel memory and then from the kernel memory to the application memory. This approach requires multiple memory accesses before the data is placed in its final destination. While it is not a major problem for small data transfers, it is a big problem for larger data transfers. This is where the interconnect zero-copy capabilities eliminate the memory bandwidth bottleneck without involving the CPU in the network data transfer. The interconnect bandwidth and latency have traditionally been used as two metrics for assessing the performance of the system’s interconnect fabric. However, these two metrics are typically not sufficient to determine the performance of real-world applications. Typical real-world applications send messages ranging from 64 Bytes to 4 Megabytes using not only point-to-point communication but a diverse mixture of communication patterns, including collective and reduction patterns in the case of MPI. In some cases, interconnect vendors create artificial benchmarks, such as message rate, and apply bombastic marketing slogans to these benchmarks such as “Hyper messaging”. Message rate is yet another single point in the point-to-point bandwidth graph. Whereas the traditional interconnect bandwidth indicates the maximum available bandwidth (a single point), message rate indicates the bandwidth for a message size of zero or 2 bytes. These single points of data give some indication of the interconnect performance, but are far from describing real-world application performance. The interactive combination of those points, together with others (CPU overhead, zero copy, etc.), will determine the overall ability of the connectivity solution. The difference between theoretical power and what is actually delivered is measured as processor efficiency. The more CPU cycles used to get the data out the door by “filling the wire” due to protocol and data transfer inefficiencies, the fewer cycles are available for the application. When comparing latencies of different interconnects, one needs to pay attention to the interconnect architecture. A 1 usec latency “on-loading” interconnect versus a 2 usec latency “off-load”


solution is similar to a case when one needs to decide between two cars that show the same horsepower (i.e., CPU). Both engines are capable of 200 miles per hour, but the first car, due to “on-loading”, limits the actual engine power to 75 miles per hour (the engine power must be used for other tasks). The second car has no limitations on the engine, but its wheels can tolerate only 150 miles per hour. Knowledge of the wheel tolerance (i.e., latency), as a single point of data, is definitely misleading.
8.8.1 Real Application Performance
InfiniBand is a proven interconnect for clustered server solutions, and one of the leading connectivity solutions for high-performance computing. InfiniBand was designed as a general I/O technology and in practice provides low latency and the highest link speed. Computational Fluid Dynamics (CFD) is one of the branches of fluid mechanics that uses numerical methods and algorithms to solve and analyze problems that involve fluid flows. FLUENT provides a set of benchmark problems which represent typical current usage, covering a wide range of mesh sizes and physical models. The problems selected represent a range of simulations typical of those which might be found in industry. The principal objective of this benchmark suite is to provide comprehensive and fair comparative information on the performance of the code on available hardware platforms. The benchmark represents the computation of the exterior flow field around a simplified model of a passenger sedan (see Figure 8.12). The simulation geometry was used for the Japan External Aerodynamics competition. A viscous hybrid grid with prismatic cells is used to adequately model the boundary layer regions (3,618,080 cells; hybrid cell type; k-ε turbulence model; segregated implicit solver).

Figure 8.12 Performance Rate of Two HPC Interconnects for a Benchmark CFD Analysis

8.8.2 Choosing the Right Interconnect
In both FLUENT benchmark cases, Mellanox InfiniBand shows higher performance and better super-linear scaling compared to QLogic InfiniPath. FLUENT’s CFD application is latency sensitive, and the results shown here are good examples of how pure latency benchmarks can be misleading when choosing the right interconnect. In order to determine the system’s performance, one should take into consideration the entire interconnect architecture (such as off-loading versus on-loading) and the ability to scale, rather than just single points of data.
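The bandwidth-versus-latency discussion above can be summarized with the standard first-order (alpha-beta) communication model; this is a generic textbook model, not taken from the vendor material cited in this section, and the numbers below are purely illustrative:

T(m) ≈ α + m/β,      B_eff(m) = m / T(m) = m / (α + m/β)

where T(m) is the time to send a message of m bytes, α is the per-message latency and β is the asymptotic link bandwidth. For small messages, T is dominated by α; for large messages, the effective bandwidth B_eff approaches β. For example, with a hypothetical α = 2 µs and β = 2 GB/s, a 64-byte message achieves only about 0.03 GB/s of effective bandwidth, while a 4-MB message achieves nearly the full 2 GB/s. This is why a single latency or bandwidth figure, taken alone, says little about applications that mix both message sizes.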

8.9 Grid Computing vs. HPC

The origins of HPC/Grid exist within the academic community where needs arose to crunch large data sets very early on. Think satellite data, genomics, nuclear physics, etc. Grid, effectively, has been around since the beginning of the enterprise computing era, when it became easier for academic


research institutions to move away from large mainframe-style supercomputers (e.g., Cray, Sequent) towards a more scale-out model using lots of relatively inexpensive x86 hardware in large clusters. The emphasis here is on relatively. Most x86 clusters today are built out for very high performance and scalability, but with a particular focus on the performance of individual components (servers) and the interconnect network, for reasons that I will explain below. The price/performance of the overall system is not as important as the aggregate throughput of the entire system. Most academic institutions build out a grid to the full budget they have, attempting to eke out every ounce of performance in each component225.

8.10 HPC vs. HSC
The reality is that High Scalability Computing (HSC) is ideal for the majority of EPP grid workloads. HPC is a different beast altogether, as many of the MPI workloads require very low latency and servers with individually high performance. It turns out, however, that not all MPI workloads are the same. The lower portion of the top part of that pyramid is filled with MPI workloads that require a great network, but not an InfiniBand network (see Figure 8.13).

Figure 8.13 Scope of HPC and HSC

8.11 The Moral of the Story
So, what we have learned is that scalable computing is different from computing optimized for performance. The cloud can accommodate grid and HPC workloads, but is not itself necessarily a grid in the traditional sense. More importantly, an extremely overlooked segment of grid (EPP) has pressing needs that can be accommodated by run-of-the-mill clouds such as EC2. In addition to supporting EPP workloads that run on the ‘regular’ cloud, some clouds may also build out an area designed specifically for ‘HPC’ workloads. In other words, grid is not cloud, but there are some relationships, and there is obviously a huge opportunity for cloud providers to accommodate this market segment.

8.12 HPC vs. Parallel Computing
The terms "high performance computing" and "parallel computing" are ambiguous226. Parallel computing is one mechanism, in which resources are added linearly, O(n), or in special SIMD cases O(n²). In parallel computing, you are expected to maintain some level of consistency, and perhaps determinism. When you are doing HPC, you are putting performance at a premium. High performance computing can indeed cover parallel computing, but it can also include the efficient use of caches, the TLB, SIMD instructions, and other things that are high performance but not parallel.
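As an illustration of performance tuning that involves no parallelism at all, the two routines below (a generic sketch written for this text, not tied to any source cited here) compute the same sum over an n-by-n matrix stored in row-major order. The first traverses memory with unit stride and is cache-friendly and easily vectorized with SIMD; the second strides through memory and typically runs many times slower on the same single core.

#include <stddef.h>

/* Cache-friendly: unit-stride access, readily auto-vectorized. */
double sum_row_major(const double *a, size_t n)
{
    double s = 0.0;
    for (size_t i = 0; i < n; ++i)
        for (size_t j = 0; j < n; ++j)
            s += a[i * n + j];       /* consecutive addresses */
    return s;
}

/* Cache-unfriendly: stride-n access, poor cache and TLB reuse. */
double sum_col_major(const double *a, size_t n)
{
    double s = 0.0;
    for (size_t j = 0; j < n; ++j)
        for (size_t i = 0; i < n; ++i)
            s += a[i * n + j];       /* jumps n elements each iteration */
    return s;
}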


225 Randy Bias, “Grid, Cloud, HPC ... What's the Diff?”, posted on the Cloudscaling Blog, 2010.
226 Victor Eijkhout, postdoctoral and industrial experience in HPC.


8.13 HPC vs. HTC
There are many differences between high-throughput computing (HTC) and high-performance computing (HPC). HPC tasks are characterized as needing large amounts of computing power for short periods of time, whereas HTC tasks also require large amounts of computing, but for much longer times (months and years, rather than hours and days). HPC environments are often measured in terms of FLOPS. The HTC community, however, is not concerned with operations per second, but rather operations per month or per year. Therefore, the HTC field is more interested in how many jobs can be completed over a long period of time than in how fast individual jobs run.


9 CFD and HPC Trends Forecasted for 2030
CFD codes utilize High Performance Computing (HPC) systems, so understanding where HPC technology might be in the 2030 timeframe is an important component of creating a vision for CFD codes in 2030227. Of course, forecasting where HPC technologies will be in the future requires a significant amount of extrapolation, which is especially hard in such a fast-changing area as HPC. The fastest current systems can perform tens of petaFLOPS228 (1 petaFLOPS is 10^15 floating point operations per second), and the HPC community is working toward systems capable of 10^18 FLOPS (exaFLOPS), which are expected sometime between 2018 and 2023. Some work is even looking at 10^21 FLOPS (zettaFLOPS). However, reaching that level of performance is unlikely without radically new technologies. A common, though controversial, measure of HPC systems is the total number of floating point operations a given system can perform in a second while solving a large linear system of equations using Gaussian elimination; this is the High Performance LINPACK benchmark. Twice a year, a list of the top 500 systems in the world against which those numbers are measured is published by the Top500 organization. The current list (June 2013) is topped by the Tianhe-2 system, developed by China’s National University of Defense Technologies, which achieved 33.86 petaFLOPS on the LINPACK benchmark. Here, we estimate only the peak floating-point performance in terms of the maximum number of operations that can be performed per second. We note that the performance of many applications, including CFD applications, may be more accurately estimated by using sustained memory bandwidth; for our purposes, however, peak performance is a reasonable proxy, provided that other aspects of system performance scale in proportion.
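As a rough, purely illustrative calculation (the machine parameters below are hypothetical and not taken from the cited report), the peak floating-point rate of a system is usually estimated as

Peak FLOPS = (number of nodes) × (cores per node) × (clock rate) × (floating point operations per core per cycle).

For example, 16,000 nodes × 16 cores × 2.7 × 10^9 cycles/s × 8 FLOPs/cycle ≈ 5.5 × 10^15 FLOPS, i.e., roughly 5.5 petaFLOPS of peak performance. Sustained performance on real CFD workloads is typically a small fraction of this figure, which is why the memory-bandwidth caveat above matters.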

9.1 Comparison of Semiconductor Fabrication Sizes in HPC
A significant measure of a processor is the feature size of its components. The smaller the features, the more elements can be placed in the same area, and hence the more powerful a processor becomes. Feature size also has a direct impact on power consumption and heat generation, with smaller sizes being better. Thus, forecasting the feature sizes of future processors is very important. Unfortunately, the industry has not always been good at such forecasting, which is one reason why predicting where HPC technology will be in 2030 is particularly hard. For example, in 2005 the International Technology Roadmap for Semiconductors (ITRS) forecasted a 22-nm (1 nm = 10^-9 m) gate length by 2008; that is, the structures in a modern processor were forecast to have features with sizes around 22 nm. However, in 2008 the

Figure 9.1 Changing Predictions About Semiconductor Sizes

227 J. Slotnick, A. Khodadoust, J. Alonso, D. Darmofal, W. Gropp, E. Lurie, and D. Mavriplis, “CFD Vision 2030 Study: A Path to Revolutionary Computational Aerosciences”, NASA/CR–2014-218178.
228 Kraft, E. M., “Integrating Computational Science and Engineering with Testing to Re-engineer the Aeronautical Development Process”, AIAA Paper 2010-0139, 48th AIAA Aerospace Sciences Meeting, January 2010, 10.2514/6.2010-139.


forecast date moved to 2011, and in 2011 it moved again to 2012. A similar slip occurred for other (smaller) gate lengths (see Figure 9.1). Note that the forecasts of the ITRS combine inputs from all major chip manufacturers, equipment suppliers, and research communities, so they represent the combined wisdom of the industry. Nevertheless, as Figure 9.1 shows, forecasting a key feature of even this basic component of processors is hard. Another critical component of HPC capability in 2030 is the advances in software infrastructure and programming methodologies that will be necessary to take advantage of these future HPC systems. The ultimate purpose of these systems is to solve the most pressing problems in academia and industry. In particular, industrial users pursue this technology because of its large impact on future product designs, and the ability to avoid or minimize the use of other, more costly methods such as wind tunnels or other types of physical tests.

9.2 Current Status of CFD
At present, CFD is used extensively in the aerospace industry for the design and analysis of air and space vehicles and components. However, the penetration of CFD into aerospace design processes is not uniform across vehicle types, flight conditions, or components. CFD often plays a complementary role to wind tunnel and rig tests, engine certification tests, and flight tests by reducing the number of test entries and/or reducing testing hours229-230. But in many circumstances, CFD provides the only affordable or available source of engineering data to use in product design, due to limitations either with model complexity and/or wind tunnel capability, or due to design requirements that cannot be addressed with ground-based testing of any kind. As a result, CFD technology development has been critical not only in minimizing product design costs, but also in enabling the design of truly novel platforms and systems. Generally, the design process is composed of three key phases: conceptual design, preliminary and detailed design, and product validation. The current usage of CFD tools and processes in all three design phases is summarized below.
9.2.1 Conceptual Design
CFD is often used in the early, conceptual design of products where it has been previously calibrated for similar applications using data-morphing techniques, as well as for new configurations where little or no engineering data is available to guide design decisions. Simplified models are typically used during the conceptual optimization phase to allow reasonably accurate trades to be made between drag, fuel consumption, weight, payload/range, thrust, or other performance measures. The use of simplified models is necessary to allow the often time-consuming optimization processes to be used in the overall design effort, but it inherently places conservatism into the final design. This conservatism derives from the use of models that are too similar within the existing product design space, other geometric simplifications, or the use of low-fidelity CFD tools that trade off flow physics modeling accuracy for execution speed.
9.2.2 Preliminary/Detailed Design
Once a product development program is launched, CFD is a necessary and uniformly present tool in the detailed configuration design process. For example, CFD is indispensable in the design of cruise wings in the presence of nacelles for commercial airplanes, and for inlet and nozzle designs; wind tunnels are used to confirm the final designs231-232. In both military and commercial aircraft design

229 Jameson, A., “Re-engineering the Design Process Through Computation”, AIAA Journal of Aircraft, Vol. 36, 1999, pp. 36-50.
230 Goldhammer, M. I., “Boeing 787 – Design for Optimal Airplane Performance”, CEAS/KATnet Conference on Key Aerodynamic Technologies, Bremen, Germany, June 2005.
231 Malik, M. R. and Bushnell, D. M. (eds.), “Role of Computational Fluid Dynamics and Wind Tunnels in Aeronautics R&D”, NASA TP-2012-217602, September 2012.
232 Goldhammer, M. I., “Boeing 787 – Design for Optimal Airplane Performance”, CEAS/KATnet Conference on Key Aerodynamic Technologies, Bremen, Germany, June 2005.


processes, CFD is the primary source of data for aircraft load distributions and ground-effect estimations. Similarly, gas turbine engine manufacturers rely on CFD to predict component design performance, having reduced the number of single-component rigs substantially as CFD capability has become more mature. Increasingly, multicomponent and multiphysics simulations are performed during the design cycle, but the long clock times often associated with these processes restrict their widespread adoption. For space exploration, CFD is often used to gain important insight into the flow physics needed to properly locate external components on the surface of launch vehicles or spacecraft. CFD is also increasingly providing substantial portions of the aero and propulsion performance database. In many cases, wind tunnel data is used only to anchor the CFD data at a few test points to provide confidence in the CFD database. CFD is the primary source of data for the hypersonic flight regime, where ground testing is limited or does not exist.
9.2.3 Product Validation and Certification
As the product development process moves into the validation phase and certification testing, CFD is often used to confirm performance test results, assess the redesign of components that show potential for improved performance, and answer any other questions that arise during product testing. Typically, product configurations evolve over the testing period based on a combination of measured results and engineering judgment bolstered by the best simulation capability available. In general, CFD modeling capability grows to capture the required scope and physics to answer the questions raised during testing. The expense of responding to often unplanned technical surprises, which results in more time on the test stand or in flight test, or in changes in hardware, drives conservatism into aerospace designs and is a significant motivation for improving the accuracy and speed of CFD. If CFD is sufficiently accurate and fast, engineers can move from their traditional design space with greater confidence and less potential risk during testing. For each of these design phases, the performance of CFD is of critical importance.
9.2.4 CFD Usage of High Performance Computing (HPC)
The effectiveness and impact of CFD on the design and analysis of aerospace products and systems is largely driven by the power and availability of modern HPC systems. During the last decades, CFD codes were formulated using message passing (e.g., MPI) and thread (e.g., OpenMP) software models for expressing parallelism to run as efficiently as possible on current generation systems. However, with the emergence of truly hierarchical memory architectures having numerous graphical processing units (GPUs), new CFD algorithms may need to be developed to realize the potential performance offered by such systems. Government labs, such as Oak Ridge National Lab (ORNL), Argonne National Lab (ANL), and the NASA Advanced Supercomputing (NAS) facility at NASA Ames Research Center, have often led the way with the acquisition and testing of new hardware. Much research on testing and tailoring of CFD algorithms takes place on these platforms, with heavy participation from academia, national labs and, to some extent, industry as well.
Government computing resources are also used to tackle large-scale calculations of challenge problems, such as the detailed direct numerical simulation (DNS) of spray injector atomization or high-fidelity simulations of transition and turbulent separation in turbomachinery. However, because of the high cost of these leadership-class systems, industry and academia often purchase smaller commodity clusters utilizing similar types of processors once the latest hardware technology is fully demonstrated on CFD problems and other important applications.
9.2.5 Turbulence Modeling
Current practices for CFD-based workflows utilize steady Reynolds-averaged Navier-Stokes (RANS)



with 1- or 2-equation turbulence models233-234, although hybrid unsteady RANS/LES methods are increasingly common for certain classes of simulations in which swirling and intentionally separated flows are dominant, such as combustors. Techniques to combine near-wall RANS regions and outer flow-field large-eddy simulation (LES) regions in these hybrid methods are immature. Many CFD design processes include an estimation of boundary layer transition, using a range of models from purely empirical to coupled partial-differential equation (PDE) solutions of stability equations235-236. Both approaches involve much empiricism, may be missing some modes of transition, and are evolving. As a result, no generalized transition prediction capability is in widespread use in Navier-Stokes CFD, and the default practice is to run the codes “fully turbulent”. Steady-state CFD accounts for a vast majority of simulations, while unsteady flow predictions are inherently more expensive and not yet uniformly routine in the design process, with some exceptions.
9.2.6 Process Automation
Current CFD workflows are often paced by the geometry preprocessing and grid generation phases, which are significant bottlenecks. In some cases, where the design effort involves components of similar configurations, specialized, automated processes are built that considerably reduce set-up time, execution of the CFD solver, and post-processing of results. This progression of the CFD workflow to a production capability only occurs in areas where the design work is routine and the investment in automation makes business sense; single prototype designs and novel configurations continue to suffer the pacing limits of human-in-the-loop workflows because the payoff for automating is not evident. This issue is not unique to the aerospace industry.
9.2.7 Solution Uncertainty and Robustness
In practice, CFD workflows contain considerable uncertainty that is often not quantified. Numerical uncertainties in the results come from many sources, including approximations to geometry, grid resolution, problem setup including flow modeling and boundary conditions, and residual convergence. Although NASA and professional organizations such as ASME and AIAA have created standards for the verification and validation of CFD and heat transfer analyses, such techniques are not widely used in the aerospace industry. With a few notable exceptions, CFD is carried out on fixed grids that are generated using the best available practices to capture expected flow features, such as attached boundary layers237. Such approaches cannot reliably provide adequate resolution for flow features whose locations are not known a priori, such as shocks, shear layers, and wakes. Although grid refinement is often seen as a solution for addressing grid resolution issues, it is seldom done in practice because uniform refinement is impractical in 3D. Adaptive mesh refinement strategies offer the potential for superior accuracy at reduced cost, but have not seen widespread use due to robustness, error estimation, and software complexity issues. Achieving consistent and reliable flow solver or residual convergence remains problematic in many industrial cases. Although many CFD codes are able to demonstrate convergence for a few simple problems, for many flows involving difficult flow physics or complex geometries, such as an aircraft in high-lift configuration, many of the current solver techniques are not strong enough to ensure robust convergence. Engineering judgment is

233 Spalart, P. R. and Allmaras, S. R., “A One-Equation Turbulence Model for Aerodynamic Flows”, La Recherche Aerospatiale, No. 1, 1994, pp. 5-21.
234 Wilcox, D. C., Turbulence Modeling for CFD, DCW Industries, 3rd edition, November 2006.
235 Stock, H. W., and Haase, W., “Navier-Stokes Airfoil Computations with eN Transition Prediction Including Transitional Flow Regions”, AIAA Journal, Vol. 38, No. 11, pp. 2059–2066, 2006, 10.2514/2.893.
236 Langtry, R. B., Menter, F. R., “Correlation-Based Transition Modeling for Unstructured Parallelized Computational Fluid Dynamics Codes”, AIAA Journal, Vol. 47, pp. 2894-2906, 2009, 10.2514/1.42362.
237 Mavriplis, D. J., Vassberg, J., Tinoco, E., Mani, M., Brodersen, O., Eisfeld, B., Wahls, R., Morrison, J., Zickuhr, T., Levy, D., and Murayama, M., “Grid Quality and Resolution Issues from the Drag Prediction Workshop Series”, AIAA Journal of Aircraft, Vol. 46, No. 3, pp. 935-950, March 2009.


required to interpret results that are not well converged, which introduces conservatism into decision making. Furthermore, the use of steady-state flow solvers is itself in question for many flows of engineering interest.
9.2.8 Multidisciplinary Analysis and Optimization (MDAO)
Although the basic concepts of MDAO are fairly well accepted in the community, the routine use of MDAO methods is not, by any means, universal. At moderate levels of fidelity, it is common industrial practice to perform coupled multidisciplinary analyses (MDA) of the most tightly integrated disciplines in a design. Aerostructural analyses, conjugate heat transfer calculations, and aeroacoustic simulations all tend to take place in aircraft, spacecraft, jet engine, and rotorcraft analysis and design processes. High-fidelity CFD is not routinely used in such MDAs, although recent years have witnessed a significant rise in the coupling of state-of-the-art CFD with additional disciplines. While frameworks for the coupling of disciplinary analyses are widely available, the ability to couple CFD with other high-fidelity descriptions of participating disciplines is limited by the availability of coupling software and, more fundamentally, by a lack of general methodologies for accurate, stable, and conservative MDAs. The application of optimization techniques in industry is mostly limited to single-discipline simulations238-239. Although conceptual design tools have long benefited from multidisciplinary optimization (MDO) approaches, high-fidelity CFD-based optimizations are still rare. During the past decade, the development of advanced surrogate modeling techniques and the introduction of adjoint-based optimal shape design techniques have enabled the use of CFD in the aerodynamic optimization of aircraft and gas turbine components. However, the use of optimization with multiple disciplines treated using high-fidelity methods is still within the realm of advanced research and is by no means a routine practice.
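For context on the adjoint-based design techniques mentioned above, the standard discrete formulation (a textbook sketch, not tied to any particular code cited here) is as follows. For an objective J(u, α) depending on the flow solution u and the design variables α, with the flow constrained by the discrete residual R(u, α) = 0, one solves the adjoint system and assembles the gradient as

(∂R/∂u)ᵀ λ = −(∂J/∂u)ᵀ,      dJ/dα = ∂J/∂α + λᵀ (∂R/∂α).

Because a single adjoint solve yields the complete gradient dJ/dα regardless of how many design variables there are, the cost of the sensitivity information is essentially independent of the dimension of the design space, which is what makes high-fidelity CFD-based shape optimization affordable.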

9.3 Vision of CFD in 2030 as Anticipated by NASA
This is in fact a mirror image of the report done by USDOE, which will be covered later, but with emphasis on CFD. Given the inherent difficulties of long-term predictions, our vision for CFD in 2030 is grounded in a desired set of capabilities that must be present for a radical improvement in CFD predictions. Of special interest are critical flow phenomena associated with the key aerospace applications, including commercial/military aircraft, engine propulsion, rotorcraft, space exploration, launch vehicle programs, air-breathing space access, and spacecraft entry240. This set of capabilities includes not only the accurate and efficient prediction of fluid flows of interest, but also the usability of CFD in broader contexts (including uncertainty quantification, optimization, and multidisciplinary applications) and in streamlined/automated industrial analysis and design processes. To complicate things further, CFD in 2030 must effectively leverage the uncertain and evolving environment of HPC platforms that, together with algorithmic improvements, will be responsible for a large portion of the realized improvements. The basic set of capabilities for CFD must include, at a minimum:
➢ Emphasis on physics-based, predictive modeling. In particular, transition, turbulence, separation, chemically reacting flows, radiation, heat transfer, and constitutive models must reflect the underlying physics more closely than ever before.
➢ Management of errors and uncertainties resulting from all possible sources:

238 Jeffrey Slotnick, Abdollah Khodadoust, Juan Alonso, David Darmofal, William Gropp, Elizabeth Lurie, and Dimitri Mavriplis, “CFD Vision 2030 Study: A Path to Revolutionary Computational Aerosciences”, NASA/CR–2014-218178.
239 Same as above.
240 Same as above.


1. Physical modeling errors and uncertainties,
2. Numerical errors arising from mesh and discretization inadequacies, and
3. Uncertainties derived from natural variability, as well as epistemic uncertainties due to lack of knowledge in the parameters of a particular fluid flow problem.
➢ A much higher degree of automation in all steps of the analysis process is needed, including geometry creation, mesh generation and adaptation, the creation of large databases of simulation results, the extraction and understanding of the vast amounts of information generated, and the ability to computationally steer the process. Inherent to all these improvements is the requirement that every step of the solution chain executes with high levels of reliability/robustness to minimize user intervention.
➢ Ability to effectively utilize massively parallel, heterogeneous, and fault-tolerant HPC architectures. For complex physical models with nonlocal interactions, the challenges of mapping the underlying algorithms onto computers with multiple memory hierarchies, latencies, and bandwidths must be overcome.
➢ Flexibility to tackle capability- and capacity-computing tasks in both industrial and research environments, so that both very large ensembles of reasonably sized solutions (such as those required to populate full flight envelopes, operating maps, or for parameter studies and design optimization) and smaller numbers of very large-scale simulations can be accommodated.
➢ Seamless integration with the multidisciplinary analyses that will be the norm in 2030, without sacrificing accuracy or numerical stability of the resulting coupled simulation, and without requiring so large an amount of effort that only a handful of coupled simulations are possible.
Included in this desired set of capabilities is a vision of the interaction between the engineer/scientist, the CFD software itself, its framework and all the ancillary software dependencies (databases, modules, visualization, etc.), and the associated HPC environment. A single engineer/scientist must be able to conceive, create, analyze, and interpret a large ensemble of related simulations in a time-critical period (e.g., 24 hours), without individually managing each simulation, to a pre-specified level of accuracy. There should be less emphasis on the mechanics of running and collecting the information, and more emphasis on interpreting and understanding the results of the work. At the moment, CFD is not yet sufficiently predictive and automated to be used in critical/relevant engineering decisions by the non-expert user, particularly in situations where separated flows are present241.
➢ Finally, we define a set of Grand Challenge (GC) problems that are bold and in fact may not be solvable in the 2030 timeframe, but are used as drivers to identify critical technologies in need of investment, and to serve as benchmarks for continually measuring progress toward the long-term development goals. These GC problems are chosen to embody the requirements for CFD in 2030, and cover all important application areas of relevance to NASA’s aeronautics mission, as well as important aspects of NASA’s space exploration mission242. They are:
1. LES of an aircraft configuration across the full flight envelope.
2. Off-design turbofan engine transient simulation.
3. MDAO of a highly flexible advanced aircraft configuration.

241 Jeffrey Slotnick, Abdollah Khodadoust, Juan Alonso, David Darmofal, William Gropp, Elizabeth Lurie, and Dimitri Mavriplis, “CFD Vision 2030 Study: A Path to Revolutionary Computational Aerosciences”, NASA/CR–2014-218178.
242 See previous.


9.3.1 Technology Roadmap to Achieve the GC Challenges
The CFD technology roadmap is a complete and concise view of the key research technologies and capabilities that must be developed and integrated into production CFD. The individual elements on the roadmap were identified based on the results of the CFD user survey, detailed technical discussions held during the Vision 2030 CFD workshop, and interactions among our team members. Key technology milestones, proposed technology demonstrations, and critical decision gates are positioned along timelines, which extend to the year 2030. Separate timelines are identified for each of the major CFD technology elements that comprise the overall CFD process. The key milestones indicate important advances in CFD technologies or capabilities that are needed within each technology element. Technology demonstrations are identified to help verify and validate when technology advances are accomplished, as well as to validate advances toward the simulations of the GC problems identified above. Specific details of the development plan for each technology element are given below.
9.3.1.1 High Performance Computing (HPC)
As mentioned previously, advances in HPC hardware systems and related computer software are critically important to the advancement of the state of the art in CFD simulation, particularly for high Reynolds number turbulent flow simulations. Based on feedback from the user community survey, we envision HPC technology advancing along two separate paths. Ongoing development of exascale systems, as mentioned earlier, will continue through 2030 and represents the technology that will most likely provide the large increase in throughput for CFD simulation in the future243. However, novel technologies, such as quantum computing or molecular computing, offer a true paradigm shift in computing potential and must be carefully considered at strategic points in the overall development plan, even though the technology is at a very low level of maturity today. In order to properly address the HPC challenge, three specific thrusts must be supported. First, current simulation software must be ported to evolving and emerging HPC architectures with a view toward efficiency and software maintainability. Second, investments must be made in the development of new algorithms, discretizations, and solvers that are well suited for the massive levels of parallelism244-245. Finally, increased access to the latest large-scale computer hardware must be provided and maintained, not only for production runs, but also for algorithmic research and software development projects, which will be critical for the design and validation of new simulation tools and techniques246. We propose several key milestones that benchmark the advances that we seek: modification of NASA and related CFD codes to efficiently execute on hierarchical memory (GPU/co-processor) systems by 2020, initial evaluation of exascale performance on a representative CFD problem, and a demonstration of 30 exaFLOPS performance for one or more of the proposed GC problems in the 2030 time frame. Concurrently, we stress the importance of closely observing advances in revolutionary HPC technologies, such as superconducting logic, new memory technologies, and alternatives to current technologies. Because these technologies are in their infancy, we foresee decision gates in 2020, 2025, and 2030 to establish the ability of these systems to solve a relevant model problem.
Implicit in this strategy is the need to provide access to experimental hardware on a continual basis and to explore radical new

243 Kogge, P. (Ed.), “ExaScale Computing Study: Technology Challenges in Achieving Exascale Systems”, Contractor report for AFRL Contract No. FA8650-07-C-7724, September 2008.
244 Mavriplis, D., Darmofal, D., Keyes, D. and Turner, M., “Petaflops Opportunities for the NASA Fundamental Aeronautics Program”, AIAA Paper 2007-4084, 18th AIAA Computational Fluid Dynamics Conference, June 2007, 10.2514/6.2007-4084.
245 Sarkar, V. (ed.), “ExaScale Software Study: Software Challenges in Extreme Scale Systems”, DARPA, IPTO, AFRL report under contract FA8650-07-C-7724, September 2009.
246 Biswas, R., Aftosmis, M. J., Kiris, C., and Shen, B. W., “Petascale Computing: Impact on Future NASA Missions”, Petascale Computing: Architectures and Algorithms (D. Bader, ed.), Chapman and Hall / CRC Press, 2007.


approaches to devising CFD simulation capabilities. If, at any of these decision points, the technology clearly shows its expected potential, we recommend increased investment to accelerate the use of these machines for CFD applications.
9.3.1.2 Physical Modeling
Advances in the physical modeling of turbulence for separated flows, transition, and combustion are critically needed to achieve the desired state of CFD. For the advancement of turbulent flow simulation, we propose three separate tracks for research: RANS-based turbulence treatments; hybrid RANS/LES approaches, where the entire boundary layer is treated with RANS-based models and the outer flow is resolved with LES models; and LES, including both wall-modeled and wall-resolved approaches. Details on each of the three development tracks, and for transition and combustion modeling, are given below. Additionally, a longer-term, high-risk effort should investigate radically new approaches to physical modeling. RANS-based turbulence models continue to be the standard approach used to predict a wide range of flows for very complex configurations across virtually all aerospace product categories. They are easy to use, computationally efficient, and generally able to capture wall-bounded flows, flows with shear, flows with streamline curvature and rotation, and flows with mild separation. For these reasons, as well as the fact that RANS models will remain an important component in hybrid RANS/LES methods, their use will continue through 2030. An advanced formulation of the RANS-based approach, where the eddy viscosity formulation is replaced with the direct modeling of the Reynolds stresses, known as the Reynolds Stress Transport method, in principle will be able to capture the onset and extent of flow separation for a wider range of flows247. Currently, hybrid RANS/LES methods show perhaps the most promise in being able to capture more of the relevant flow physics for complex geometries at an increasingly reasonable computational cost248. In the user survey, the majority of participants ranked the continued development of hybrid RANS/LES methods as the top priority in the area of turbulence modeling. However, as mentioned previously, several issues still exist. First, the prediction of any separation in the boundary layer will still require improvements in RANS-based methods. Second, a seamless, automatic RANS-to-LES transition in the boundary layer is needed to enhance the robustness of these methods. Continued investment in hybrid RANS/LES methods to address these two critical shortcomings will be required. Additionally, more effective discretizations and solvers designed specifically for LES-type problems must be sought. When combined with advances in HPC hardware, these developments will enable a continued reduction in the RANS region as larger resolved LES regions become more feasible. It is fully anticipated that hybrid RANS/LES methods will become viable in production mode by the 2030 timeframe for problems typical of the proposed GCs.
9.3.1.3 Numerical Algorithms
The development of novel numerical algorithms will be critical to achieving the stated CFD 2030 goals. Indeed, the proposed GCs are sufficiently ambitious that advances in HPC hardware alone during the next 20 years will not be sufficient to achieve these goals. As demonstrated in Case Study 2, even for LES of relatively simple geometries, leadership-class HPC hardware in 2030 will be needed for 24-hour turnaround if existing algorithms are used.
Thus, to tackle the proposed GCs, orders-of-magnitude improvements in simulation capabilities must be sought from advances in numerical algorithms249. The focus of investment must be on discretizations and solvers that scale to massive

247 Eisfeld, B., “Reynolds Stress Modeling for Complex Aerodynamic Flows”, Presented at the European Conference on Computational Fluid Dynamics, ECCOMAS CFD 2010, Lisbon, Portugal, June 14−17, 2010.
248 Song, F., Haase, W., Peng, S-H., and Schwamborn, D. (eds.), Progress in Hybrid RANS-LES Modeling, Springer Press, ISBN 978-3-642-31817-7, September 2011.
249 Mavriplis, D., Darmofal, D., Keyes, D. and Turner, M., “Petaflops Opportunities for the NASA Fundamental Aeronautics Program”, AIAA Paper 2007-4084, 18th AIAA Computational Fluid Dynamics Conference, 2007.


levels of parallelism, that are well-suited for the high-latency, deep memory hierarchies anticipated in future HPC hardware, and that are robust and fault tolerant. A well-balanced research program must provide for incremental advances of current techniques (e.g., extending the scalability of current CFD methods to the exascale level whenever possible), while at the same time investing in the fundamental areas of applied mathematics and computer science to develop new approaches with better asymptotic behavior for large-scale problems and better suitability for emerging HPC hardware. Discretization techniques such as higher-order accurate methods offer the potential for better accuracy and scalability, although robustness and cost considerations remain250. Investment must focus on removing these barriers in order to unlock the superior asymptotic properties of these methods, while at the same time pursuing evolutionary improvements in other areas such as low-dissipation schemes, flux functions, and limiter formulations. Simultaneously, novel nontraditional approaches, such as Lattice-Boltzmann methods or other undeveloped schemes, should be investigated for special applications. Improved linear and nonlinear solvers must be developed, and here as well the focus must be on highly scalable methods that are designed to be near optimal for the large-scale, time-implicit unsteady CFD and MDAO simulations anticipated in the future. These may include the extension of well-known matrix-based techniques, [Krylov methods]251, highly parallel multigrid methods252, or the development of completely novel approaches such as systematic upscaling methods253. Furthermore, these methods must be extensible to tightly coupled multidisciplinary problems. Investment in discretizations and solvers must also consider the potential of these methods to operate on dynamically adapting meshes, to enable optimization procedures, and to incorporate advanced uncertainty quantification capabilities. In many cases, adjoint technology254-255 will be required from the outset for all of these capabilities, but the potential of other more advanced technologies such as second-order gradients [Hessians]256-257 should be investigated as well. Longer-term, high-risk research should focus on the development of truly enabling technologies such as monotone or entropy-stable schemes in combination with innovative solvers on large-scale HPC hardware. The technology roadmap envisions the demonstration of improved robust and scalable solvers in the 2015-2017 timeframe, for both second-order and higher-order accurate methods. The demonstration of complete configuration grid-convergence technology in the 2020 time frame relies on the use of robust higher-order discretizations combined with improved scalable solvers and adaptive h-p refinement. Toward the 2030 time frame, it is anticipated that novel entropy-stable formulations will begin to bear fruit for industrial simulations.
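For reference, the Krylov methods mentioned above (a textbook definition, not specific to any solver cited here) approximate the solution of a large sparse linear system A x = b within the subspace

K_m(A, r₀) = span{ r₀, A r₀, A² r₀, …, A^(m−1) r₀ },   with r₀ = b − A x₀,

selecting x_m ∈ x₀ + K_m(A, r₀) so that the residual is minimized (as in GMRES) or made orthogonal to the subspace (as in the conjugate gradient method for symmetric positive-definite systems). Their appeal for implicit CFD is that they require only matrix-vector products, which parallelize naturally, and they combine well with multigrid or other preconditioners of the kind discussed above.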

250 Kroll, N., Bieler, H., Deconinck, H., Couallier, V., van der Ven, H., and Sorensen, K. (Eds.), “ADIGMA – A European Initiative on the Development of Adaptive High-Order Variational Methods for Aerospace Applications”, Notes on Numerical Fluid Mechanics and Multidisciplinary Design, Vol. 11, Springer, 2010.
251 Saad, Y., Iterative Methods for Sparse Linear Systems, Second Edition, SIAM, 2003.
252 Baker, A. H., Falgout, R. D., Kolev, Tz. V., and Yang, U. M., “Scaling Hypre’s Multigrid Solvers to 100,000 Cores”, High Performance Scientific Computing: Algorithms and Applications, M. Berry et al., eds., Springer, 2012.
253 Brandt, A., “Multiscale Solvers and Systematic Upscaling in Computational Physics”, Computer Physics Communications, Vol. 169, Issues 1–3, pp. 438-441, July 2005.
254 Jameson, A., “Aerodynamic Design via Control Theory”, ICASE Report No. 88-64, November 1988; also, Journal of Scientific Computing, Vol. 3, pp. 233-260, 1988.
255 Errico, R. M., “What is an Adjoint Model?”, Bulletin of the American Meteorological Society, pp. 2577–2591, 1997.
256 Taylor, A. C., Putko, M. M., Green, L. L., and Newman, P. A., “Some Advanced Concepts in Discrete Aerodynamic Sensitivity Analysis”, AIAA Journal, Vol. 41, pp. 1224-1229, 2003, 10.2514/2.2085.
257 Rumpfkeil, M. P., and Mavriplis, D. J., “Efficient Hessian Calculations Using Automatic Differentiation and the Adjoint Method with Applications”, AIAA Journal, Vol. 48, pp. 2406-2417, 2008, 10.2514/1.J050451.


9.3.1.4 Uncertainty Quantification (UQ)
With regard to uncertainty quantification, a new thrust in the area of probabilistic large-scale CFD for aerospace applications should be initiated. An initial thrust in this area should focus on enabling current aerospace CFD tools with well-known uncertainty quantification techniques, such as sensitivity analysis and propagation methods using adjoints and forward linearization, nonintrusive polynomial chaos methods, and other reduced-order model formulations258-259. Additionally, a concerted effort should be made to characterize important aerospace uncertainties and to make these available to the general research community for enabling relevant UQ research in these areas. Improved error estimation techniques must be investigated and developed, given the known deficiencies of current approaches (including adjoint methods). This will require a foundational program in the mathematics of error estimation and its application to CFD software. Finally, longer-term research must focus on statistical approaches such as Bayesian techniques for more accurately quantifying modeling and other nonlinear error sources260. The technology roadmap includes an early target date of 2015 for the characterization of typical aerospace uncertainties in order to stimulate work in this area. Improved error estimation techniques will be gradually brought into the simulation capabilities, and the state of these estimates will be assessed in the 2018 time frame. Comprehensive uncertainty propagation techniques, including discretization error and input and parameter uncertainties in production-level CFD codes, should be targeted for 2025, while the development of more sophisticated stochastic and Bayesian approaches will continue through the 2030 time frame.
9.3.1.5 Geometry and Grid Generation
Substantial new investment in geometry and grid generation technology will be required in order to meet the CFD Vision 2030 goals. In general, this area has seen very little NASA investment during the last decade, although it remains one of the most important bottlenecks for large-scale complex simulations. Focused research programs in streamlined CAD access and interfacing, large-scale mesh generation, and automated optimal adaptive meshing techniques are required. These programs must concentrate on the particular aspects required to make mesh generation and adaptation less burdensome and, ultimately, invisible to the CFD process, while developing technologies that enable the capabilities that will be required by CFD Vision 2030 applications, namely very large-scale parallel mesh generation, curved mesh elements for higher-order methods261-262, highly scalable dynamic overset mesh technology263, and anisotropic adaptive methods for time-dependent problems. It is important to realize that advances in these areas will require a mix of investments in incremental software development, combined with advances in fundamental areas such as computational geometry, possibly with smaller components devoted to high-risk disruptive ideas such as

258 Shankaran, S., and Jameson, A., “Robust Optimal Control using Polynomial Chaos and Adjoints for Systems with Uncertain Inputs”, AIAA Paper 2011-3069, 20th AIAA Computational Fluid Dynamics Conference, 2011.
259 Ng, L. W-T., Huynh, D. B. P., and Willcox, K., “Multifidelity Uncertainty Propagation for Optimization Under Uncertainty”, 12th AIAA Aviation Technology, Integration, and Operations (ATIO) Conference and 14th AIAA/ISSMO Multidisciplinary Analysis and Optimization Conference, 2012, 10.2514/6.2012-5602.
260 Press, S. J., Subjective and Objective Bayesian Statistics: Principles, Methods and Applications, 2nd edition, Wiley, New York, 2003.
261 Wang, L., Anderson, W. K., Erwin, J., and Kapadia, S., “High-order Methods for Solutions of Three-dimensional Turbulent Flows”, AIAA Paper 2013-856, 51st AIAA Aerospace Sciences Meeting, Jan 2013.
262 Persson, P-O., Willis, D., and Peraire, J., “The Numerical Simulation of Flapping Wings at Low Reynolds Numbers”, AIAA Paper 2010-724, 48th AIAA Aerospace Sciences Meeting, Jan 2010.
263 Pulliam, T. H., and Jespersen, D. C., “Large Scale Aerodynamic Calculation on Pleiades”, Proceedings of the 21st International Conference on Parallel Computational Fluid Dynamics, Moffett Field, California, May 18-22, 2009.


anisotropic cut-cell meshes264, strand mesh ideas265, and even meshless methods266. Additionally, because significant technology currently resides with commercial software vendors, particularly for CAD interfaces and access, involving these stakeholders in the appropriate focused research programs will be critical for long-term success. Innovative approaches for achieving such partnerships must be sought out, such as the formation of consortiums for the definition and adoption of standards, or for addressing other potential issues such as large-scale parallel licensing of commercial software. The technology development roadmap envisions the demonstration of tight CAD coupling and production adaptive mesh refinement (AMR) in the 2015-2017 time frame, followed by maturation of large-scale parallel mesh generation in the 2020-2025 time frame, and leading ultimately to fully automated in-situ mesh generation and adaptive control for large-scale time-dependent problems by 2030.
9.3.1.6 Knowledge Extraction
Petascale and exascale simulations will generate vast amounts of data, and various government agencies such as the NSF and DOE have instituted major programs in data-driven simulation research. In order to make effective use of large-scale CFD and MDAO simulations in aerospace engineering, a thrust in data knowledge extraction should be initiated. Ideally, this should contain three components: a visualization component, a database management component, and a variable-fidelity data integration component. Methods to process and visualize very large-scale unsteady CFD simulations in real time, including results from higher-order discretizations, are required to support the advanced CFD capabilities envisioned in 2030. Although many of the current efforts in maturing visualization technology are being led by commercial vendors who continue to supply enhanced capabilities in this area, more fundamental research to directly embed visualization capabilities into production CFD tools optimized for emerging HPC platforms is needed to achieve real-time processing267. Moreover, the CFD capability in 2030 must provide the analyst with a more intuitive and natural interface into the flow solution to better understand complex flow physics. Foreseeing the capability of generating large databases with increasing computational power, techniques for rapidly integrating and querying these databases in real time will be required. Finally, integrating high-fidelity simulation data with lower-fidelity model data, as well as experimental data from wind tunnel tests, engine test rigs, or flight tests, will provide a powerful approach for reducing overall risk in aerospace system design268. Techniques for building large-scale flexible databases are in their infancy, and range from simple software infrastructures that manage large numbers of simulation jobs to more sophisticated reduced-order models269, surrogate models, and Kriging methods270. The objective of a research thrust in this area should be to apply existing techniques to current CFD simulation capabilities at a large scale, while simultaneously performing foundational research in the

264 Modisette, J., and Darmofal, D., “Toward a Robust, Higher-Order Cut-Cell Method for Viscous Flows”, AIAA Paper 2010-721, 48th AIAA Aerospace Sciences Meeting, Jan 2010.
265 Katz, A., Wissink, A., Sitaraman, J., and Sankaran, V., “Application of Strand Meshes to Complex Aerodynamic Flow Fields”, AIAA Paper 2010-4934, 28th AIAA Applied Aerodynamics Conference, June 2010.
266 Katz, A., and Jameson, A., “Meshless Scheme Based on Alignment Constraints”, AIAA Journal, Vol. 48, pp. 2501-2511, 2010.
267 Wang, Y., Yu, H., and Ma, K-L., “Scalable Parallel Feature Extraction and Tracking for Large Time-Varying 3D Volume Data”, Proceedings of EGPGV 2013, May 2013, pp. 55-62.
268 The 1st Workshop on Integration of Experimental Fluid Dynamics (EFD) and Computational Fluid Dynamics (CFD), JAXA Special Publication SP-09-002, January 2010.
269 Washabaugh, K., Amsallem, D., Zahr, M., and Farhat, C., “Nonlinear Model Reduction for CFD Problems Using Local Reduced-Order Bases”, AIAA Paper 2012-2686, 42nd AIAA Fluid Dynamics Conference, June 2012.
270 Han, Z-H., and Görtz, S., “Hierarchical Kriging Model for Variable-Fidelity Surrogate Modeling”, AIAA Journal, Vol. 50, pp. 1885-1896, 2012, 10.2514/1.J051354.


development of better reduced-order models and variable-fidelity models that are applicable to aerospace problems and can support embedded uncertainty quantification strategies. The technology roadmap envisions the demonstration of real-time analysis and visualization of a notional 10^10 point unsteady CFD simulation in 2020, and a 10^11 point simulation in 2025. These technology demonstrations would be an integral part of the GC problems designed to benchmark advances in other CFD areas. The development of reduced-order models and other variable-fidelity models will entail long-term research and will likely remain an active research topic past the 2030 time frame. However, the technology roadmap envisions the periodic assessment of the state of the art in these areas at 5-to-10-year intervals, with investment directed toward demonstrating promising approaches on large-scale aerospace applications.
9.3.1.7 Multidisciplinary Design and Optimization
The ability to perform CFD-based multidisciplinary analysis (MDA) and analysis/optimization (MDAO) relies on the availability of future capabilities that need to be developed between now and 2030. Pervasive and seamless MDAs (that can be routinely exercised in industrial practice for configuration studies, e.g., full aero-thermo-elastic/aero-acoustic simulations of entire airframe/propulsion systems including shielding) will require the development of accepted standards and APIs for disciplinary information and the required multidisciplinary couplings (such as with acoustics, combustion, structures, heat transfer, radiation). A concerted effort is envisioned that results in a set of standards available to the community around 2016. In parallel with this effort, it will also be necessary to develop high-fidelity coupling techniques that guarantee the accuracy and stability of high-fidelity, tightly coupled MDAs271, while ensuring that the appropriate conservation principles are satisfied with errors below acceptable thresholds. This capability, together with the coupling software that includes such information transfers, must be available around 2018. Together, the standards and the coupling techniques/software would enable demonstrations of two-way coupled MDAs with the best and most robust existing CFD solvers of the time, while guaranteeing coupling fidelity, by the year 2020. Such demonstrations can focus on multiple aerospace problems of interest, including aircraft aero-structural/aero-elastic analyses, aircraft aero-acoustics, rotorcraft aero-structural and aero-acoustic couplings, unsteady combustion, reentry aerothermodynamics and material response, and the like. Initially, such routine MDAs would focus on portions of an entire vehicle (around 2020) and would transition to the treatment of the entire system around 2025. A number of capabilities also must be developed in order to enable MDAO with and without the presence of uncertainties (robust and reliability-based design). A major research component that is likely to span a significant period (2015-2025) is the work needed to endow industrial-strength CFD solvers with both gradient calculation and uncertainty quantification capabilities for use in multidisciplinary optimization. Some of this work has been described in the “Numerical Algorithms” section. For the gradient/sensitivity analysis capability, we envision that the CFD solver will be able to compute this information for full unsteady flows for the turbulence models available at the time.
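As a concrete, if greatly simplified, illustration of the reduced-order and surrogate modeling discussed in the Knowledge Extraction subsection above, the sketch below builds a POD basis from a snapshot matrix via the singular value decomposition and projects a new field onto it. The synthetic snapshot data, the 99% energy threshold, and the array sizes are assumptions chosen purely for illustration.

```python
import numpy as np

# Assumed synthetic snapshot matrix: each column is one flow-field sample.
rng = np.random.default_rng(0)
n_dof, n_snap = 5000, 40
snapshots = rng.standard_normal((n_dof, n_snap))

# POD basis from the thin SVD of the snapshot matrix.
U, s, _ = np.linalg.svd(snapshots, full_matrices=False)
energy = np.cumsum(s**2) / np.sum(s**2)
r = int(np.searchsorted(energy, 0.99) + 1)   # retain 99% of snapshot energy
basis = U[:, :r]

# Project a new full-order field onto the reduced basis and reconstruct it.
new_field = rng.standard_normal(n_dof)
coeffs = basis.T @ new_field                 # reduced coordinates (length r)
reconstruction = basis @ coeffs              # rank-r approximation of the field
```

In a database or surrogate-modeling setting, the reduced coordinates (rather than the full fields) are what would be stored, queried, and interpolated.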
Finally, all these new capabilities must come together in a series of MDAO grand-challenge demonstrations in the 2030 time frame.
9.3.2 Recommendations
In order to effectively execute the CFD development plan described above and achieve the goals laid out in the vision of CFD in 2030, a comprehensive research strategy and set of recommendations are presented. This research strategy calls for the renewed preeminence of NASA in the area of

271 “Multiphysics Simulations: Challenges and Opportunities”, Argonne National Laboratory Report ANL/MCS-TM-321, Report from Workshop sponsored by the Institute for Computing in Science (ICiS), Park City, Utah, June-August 2011.


computational sciences and aerodynamics, and calls for NASA to play a leading role in the pursuit of revolutionary simulation-based engineering. Aerospace engineering has had a long history of developing technology that impacts product development well beyond the boundaries of aerospace systems. As such, NASA is a critical force in driving technology throughout aerospace engineering directly by fulfilling its obligations. Computational methods are a key example of this broad impact, as NASA has historically been a leader in the development of structural finite-element methods, computational fluid dynamics, and applications of HPC to engineering simulations. NASA’s effort must be targeted toward research and technology development that can make revolutionary impacts on simulation-based engineering in the aerospace sciences. In particular, the current state of CFD is such that small, incremental improvements in existing capability have not had revolutionary effects. In an environment of constrained resources, this will require that NASA evaluate its activities with a critical eye toward supporting those efforts whose impact could be revolutionary. To ensure that the technology plan and roadmap are as effective as possible, we propose specific recommendations (see Figure 9.2). Naturally, individual research thrusts affect multiple technical areas, which in turn affect the ability to meet various milestones and progress toward the GC problems.

Figure 9.2   Proposed New Computational Sciences Program Structure. The figure groups the proposed program into six focus areas:
• Geometry and Grid Generation: CAD access and interfaces; large-scale parallel mesh generation; adaptive mesh refinement; curved mesh elements for higher-order methods.
• HPC: increasing access to leading-edge HPC hardware; porting of current and future codes to leading-edge HPC; radical emerging HPC technologies.
• MDAO: interfaces and standards; accurate and stable coupling techniques; UQ support and sensitivities (system level).
• Physical Modeling: RANS turbulence modeling; hybrid RANS-LES modeling (improved RANS component, seamless interface); LES (wall-modeled and wall-resolved); transition; combustion; radically new modeling approaches.
• Knowledge Management: visualization; database management; variable-fidelity models.
• Numerical Algorithms: advances in current algorithms for HPC; novel discretizations (higher-order methods, low dissipation/dispersion schemes, foundational novel approaches); solvers (linear and nonlinear scalable solvers, enhancements for MDAO and UQ); UQ (defining aerospace uncertainties, leveraging known techniques, improved error estimation techniques, statistical approaches).


9.4   HPC Envisioned by the Department of Energy (DOE)

The aim here is to examine the main issues raised by ‘going to the exascale’, and to provide some guidance on the level of risk involved in pursuing272, and not pursuing, this direction of high-performance computing. ‘Going to the exascale’ will mean a radical change in computing architecture: a vast increase in the levels of parallelism, to the point of millions of processors working in concert, which will force radical changes in how hardware is designed. It will also dictate how we go about solving problems (e.g., the application codes), and how we marry application codes to the underlying hardware (e.g., the compilers, I/O, middleware, and related software tools). Understanding the advantages to be gained by going to the exascale, and evaluating the risks involved in going down this path, requires both an evaluation of past experiences in moving from the megaflop era to the present petaflop era, as well as an assessment of the readiness of advanced applications to take transformative advantage of exascale computing. The challenges inherent in developing exascale computing as a practical endeavor are considerable, and significant investments will be needed to accomplish this.
9.4.1 What is Exascale Computing?
Exascale computing refers to computing systems capable of at least one exaFLOPS, or a billion billion (10^18) calculations per second273. Such capacity represents a thousandfold increase over the first petascale computer, which came into operation in 2008274. (One exaflops is a thousand petaflops, or a quintillion floating-point operations per second.) At a supercomputing conference in 2009, Computerworld projected exascale implementation by 2018. Exascale computing would be a significant achievement in computer engineering, for it is believed to approach the order of processing power of the human brain at the neural level (the functional equivalent might be lower). It is, for instance, the target power of the Human Brain Project275.
9.4.2 Why Exascale?
The most obvious question, the key question really, is of course: why go to the exascale? This question is not meant in the trivial sense that one would pose for any expenditure whatsoever in leading-edge computing technologies, but rather is motivated by the fact that the transition from current petascale computing to the exascale will involve investments across the board, from hardware to fundamental algorithms, programming models, compilers, and application codes, that will dwarf previous levels of investment made as computer architectures have evolved in the past. That is, we recognize that the value to society extracted from this change in computing paradigm has to be commensurate with the costs of developing this type of computing, and given the substantial costs, we need to be sure that the extracted values are similarly substantial. We will make the argument in the following that the extracted values are in fact very large, but will do so in two stages: first by making some general points about the present frontiers of computing independent of discipline, and then by focusing on a few example disciplines to illustrate the more general point.
9.4.3 Range of Applications that May Be Transformed by Going to the Exascale
As discussed earlier, a key question to be addressed in considering going to the exascale is the readiness of key applications to take this step, as well as the likelihood that taking this approach will

272 The Opportunities and Challenges of Exascale Computing, Summary Report of the Advanced Scientific Computing Advisory Committee (ASCAC) Subcommittee, Fall 2010, USDOE.
273 From Wikipedia, the free encyclopedia.
274 National Research Council, “The Potential Impact of High-End Capability Computing on Four Illustrative Fields of Science and Engineering”, The National Academies, p. 11, ISBN 978-0-309-12485-0.
275 The Opportunities and Challenges of Exascale Computing, Summary Report of the Advanced Scientific Computing Advisory Committee (ASCAC) Subcommittee, Fall 2010, USDOE.


lead to transformative changes in these application areas. This question is addressed in the present section, focusing once again on a selection of disciplines to illustrate the breadth of applications that are ready for this transition.
9.4.3.1 Aerospace, Airframes and Jet Turbines
Computing at an extreme scale will have transformational effects on several key applications in the aerospace industry276. The move from RANS to LES as the industry standard and its use in the design cycle represents a paradigm shift for the aerospace industry. In addition, there are several outstanding scientific problems in these sectors that can be understood and hopefully controlled using extreme-scale computing. The accuracy achieved with the RANS approach for prediction of quantities of engineering interest in the airframe industry has reached a plateau owing to the epistemic uncertainties inherent in such turbulence models. As a result, the design of aircraft and propulsion systems relies on an iterative process where several expensive prototypes are constructed and tested in wind tunnels. Hybrid RANS/LES approaches with grounding in first principles can overcome the limitations of RANS and enhance the predictive capability of CFD beyond the present seemingly stagnant state of speculative trial-and-error in design277. In addition, building a complete flight-envelope characterization (accounting for irreducible uncertainties, e.g., angle-of-attack, flight conditions, and geometry) will only be possible with computing at the exascale and beyond. Such a design framework for aerodynamically optimized vehicles and propulsion systems is a critical resource for the design and construction of next-generation aircraft and propulsion systems. Figure 9.3 provides estimates of the computing requirements needed to meet these design goals and to address several Grand Challenges in aerospace systems, where the computer speed and memory requirements

Figure 9.3   Computer speed and memory requirements for the Grand Challenge

276 See above.
277 Wall-modeled LES (WM-LES) and hybrid RANS-LES methods provide a clear path to first-principles design of next-generation aircraft as exascale computing arrives. Transitioning this technology to future exascale platforms will have a transformative impact upon simulation-based engineering design, making possible the design of aerodynamically optimized vehicles including integrated effects of propulsion, structures, and active controls, a “Grand Challenge” of aerodynamic design.


for analysis and design of airfoils, wings, and complete aircraft are shown for three different stages of approximation. One of the major problems confronting the aircraft industry is the aerodynamic noise generated by engine exhaust jets and airframes, particularly during take-off and landing approaches. Noise has been a major issue for high-speed commercial aircraft and more recently for military aircraft, both for its impact on communities surrounding airports and military bases, and for its impact on the crew stationed on aircraft carrier decks. It is known that turbulence is a major contributor to aircraft noise. Unfortunately, modern optical diagnostic techniques are far from adequate in measuring the spatio-temporal data needed to reveal the mechanics of aerodynamic noise; only high-fidelity simulation techniques, such as LES, are capable of predicting both the far-field noise as well as details of the noise-generating turbulent eddies. Exascale computing would have a transformational impact on the discovery of the mechanics of noise generation, and would be instrumental in designing noise mitigation strategies. Figure 9.4 shows the turbulent flow from a supersonic exhaust jet (M = 1.7) obtained from a breakthrough, state-of-the-art LES computation in 2010. This first-of-its-kind calculation lacked a high-fidelity representation of the flow inside the nozzle, and the agreement with the measured noise data is only fair, presumably because of this inadequate grid resolution. As exascale computing tools become available, high-fidelity tools would not only be used to understand and predict flow-generated noise, they would also be used to learn how to control it. Such demonstration calculations have been extremely computer intensive, and limited to very simple flows. Exascale computing would be the enabling technology for complex flow control and shape optimization (e.g., of aircraft wings and nozzle exits), potentially leading to a major transformational effect on the aerospace industry. Another outstanding technical problem in the gas-turbine industry is the migration of hot fluid parcels from the combustor to the turbine. This hot-streak migration is a limiting factor in the design of turbines, as turbine blades, designed based on mean flow temperatures, are damaged severely when encountering the migrating hot spots. High-fidelity simulation of the flow inside the combustor of a jet engine is a daunting task due to the multi-physics phenomena present. Even in modern LES computations of combustors using petascale-class computers, reduced-order models are used for critical phenomena such as the primary atomization of the injected liquid fuel into micron-size droplets, the evaporation process of the droplets, and the chemical mechanisms involved. Exascale computing

Figure 9.4   A Supersonic Jet Engine Nozzle Rapidly Accelerates High-Pressure Gas into the Atmosphere


would be the enabling technology for simulation of jet engine combustors based on first principles, which in turn promises to facilitate the discovery of mitigation strategies for the suppression of hot-streak migration278.
9.4.3.2 Combustion
Reliable prediction requires, for example, the incorporation of heterogeneous kinetics with quantified uncertainties in turbulent combustion simulations for processes such as soot formation/burnout, and increased-fidelity coupling of high-pressure, low-temperature chemistry with turbulent transport; these vital enhanced modeling techniques will only be feasible at exascale computing performance levels. In particular, combustion scientists must focus on the science underlying the development of non-petroleum-based fuels, including carbon-neutral biofuels, and their optimal use in transportation. This science intrinsically involves chemistry with transport at conditions far from equilibrium and at extreme pressures, and it requires a coordinated multi-scale approach for understanding and predicting combustion in turbulent environments279. Combustion in practical devices covers a myriad of time and length scales, from the scale of the electron to the largest scales of turbulence, which depend upon the geometry of the device. To tackle this daunting challenge and complexity, a multi-scale approach is adopted wherein experiments, theory and direct computation are brought to bear on a limited range of scales (4-5 decades), and the fundamental physical insights gained are encapsulated in reduced-order parameterizations that are used to upscale knowledge and bridge the scales. Several high-fidelity computational approaches in both the atomistic and continuum regimes utilize petascale computing. Exascale computing would greatly facilitate higher fidelity or access to more practically relevant parameter regimes (e.g., higher pressure, higher turbulence levels, and more complex fuels). In the continuum regime, where turbulence scales interact with flame, ignition, and mixing scales, turbulence-chemistry interactions are important. Virtually all combustion devices operate in turbulent environments because of the enhanced mixing and greater efficiency turbulence provides. Many of the fundamental turbulence-chemistry interactions are amenable to investigation by first-principles direct numerical simulation (DNS) and high-fidelity large-eddy simulation (LES) of building-block, laboratory-scale flows. Whereas DNS focuses on fully resolving the fine-grained physics, LES resolves the energy-containing end of the turbulence spectrum down to a specified cut-off in the inertial or dissipative range, and the unresolved sub-grid scales are modeled. As such, these methods are complementary. Both DNS and LES require the horsepower of high-performance supercomputing at the exascale and beyond to resolve all relevant flow and chemical scales. Exascale simulations are required, for example, to understand the coupling between low-temperature ignition kinetics and turbulent mixing at high pressure that determines lifted flame stabilization, ignition timing, rate of combustion, and emissions characteristics. Understanding the complex low-temperature, high-pressure kinetics of alternative fuels and their coupling with turbulent transport at high pressure requires much greater resolution and the transport of large numbers of reactive scalars, which is only afforded by extreme-scale computing power.
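As a rough indication of why such simulations push toward the exascale, the snippet below evaluates the commonly quoted estimate that DNS of homogeneous turbulence requires on the order of Re^(9/4) grid points, with total work growing roughly as Re^3. The Reynolds numbers are arbitrary, and real combustion DNS is further burdened by the chemical scales and reactive scalars discussed above.

```python
# Rough DNS cost estimate: grid points ~ Re^(9/4), total work ~ Re^3
# (commonly quoted scaling for homogeneous turbulence; illustrative only).
for Re in (1e4, 1e5, 1e6):
    grid_points = Re ** 2.25
    relative_work = Re ** 3
    print(f"Re = {Re:.0e}: ~{grid_points:.1e} grid points, "
          f"relative cost ~{relative_work:.1e}")
```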
Moreover, in-situ reduction strategies for accurate and computationally affordable inclusion of heterogeneous kinetics with quantified uncertainties in DNS and LES are required. The insights gained from exascale simulations will enable the development of predictive multi-scale models to optimally design future evolving fuels and engines. Future predictive simulation tools running on exascale computing systems will enable deep understanding of underlying chemical and combustion science processes, enhance combustion engine design and performance, and ultimately yield a dramatic reduction in engine development timescales, time to market, and development costs, while ensuring the timely achievement of energy

278 The Opportunities and Challenges of Exascale Computing, Summary Report of the Advanced Scientific Computing Advisory Committee (ASCAC) Subcommittee, Fall 2010, USDOE.
279 See previous.


security and emissions goals, and enhancing the competitiveness of U.S. engine manufacturers and fuel producers.
9.4.3.3 Climate Modeling
Although substantial uncertainty exists as to the degree and impacts of future climate change, especially at local and regional scales, it is generally agreed that significant adaptation will be required. Furthermore, the magnitude of climate change later in the century depends upon the near- and intermediate-term mitigation strategies used to reduce the emission of greenhouse gases. These strategies also must satisfy an increasing energy demand of a growing global population experiencing an improvement in its standard of living. Predicting these future climate changes and evaluating the effects of mitigation strategies require Earth system models (ESMs) that are far more accurate and comprehensive than those in use today. Integrated assessment models provide the framework for climate predictions by defining the emissions scenarios and elucidating the relationships among the natural and human systems that are at the core of climate change studies. In the next decade, integrated assessment and comprehensive ESMs will probably be combined into a single system that could be used to investigate scientific issues and to formulate policy options for adaptation and mitigation. The predictions from integrated ESMs will be most credible if the important processes in the climate system, for example mixing by ocean eddies, are simulated at their native spatial and temporal scales. Critical organized features in the atmosphere and ocean, including clouds and eddies, have characteristic sizes of 1 to 10 km. Some of the major sources of uncertainty in climate predictions from existing models are associated with the aggregate effects of these phenomena. Experience with current climate models suggests that simulation of climate change with a model at 10-km grid resolution is inherently a petascale problem. In fact, even higher resolution is required to resolve these features with sufficient fidelity to the physical principles underlying their formation and evolution. Since the computational cost increases nonlinearly with higher resolution, it is likely that predictions of societal and environmental change at 1-km resolution would require truly extreme-scale computers.
9.4.3.4 Computational Biology
The ultimate goal of exascale computing applications to challenges in modern biology is to go from atoms to organs, or from microbes to ecosystems: for example, to enable an understanding of how the brain works as an energy-efficient, biologically based information system, or to understand microbial processes and their impact on the geosphere. In the process, these newly enlarged scales of computing will resolve unfathomably complex research issues in a host of fields as diverse as neuroscience and microbial metagenomics. At exascale, new scalable tools that admit a variety of time, space and trajectory sampling methods (and fully exploit the hundreds of millions of cores of an exascale machine) will enable long time integrations, implicit solvation conditions, and mixed molecular mechanics and quantum mechanics models, allowing breakthrough science. For example, a large biochemical network within a full-scale model of a eukaryotic cell could be modeled in the span of a few hours. It is important to note that the first million-atom simulation in biology was conducted just five years ago: an all-atom simulation of the ribosome performed at Los Alamos National Laboratory.
This million-particle simulation milestone had already been achieved a decade earlier in materials science and cosmology (computational scientists in both of these fields now perform multibillion-particle simulations). While biology researchers have achieved impressive methodological advances that permit the modeling of the largest assemblies in the cell, they can do so only for short periods of time. Moreover, these simulations are unlikely to scale to the size of a single cell, even a small bacterium, for relevant times such as minutes or hours, even if researchers can employ computers capable of achieving 1,000 petaflops/s. Today, researchers are limited to the microsecond timescale for protein


folding, owing to the huge number of intermolecular interaction computations required. Scientists also lack rigorous coarse-grained models that permit the scaling up of macromolecular pathways and supramolecular cellular processes. Similarly, systems biology methods lack the dynamic resolution needed for coupling genomic and other data in order to fully map cellular networks, to predict their functional states, and to control the time-varying responses of living cells. Nor can current kinetics models adequately analyze the dynamics of complex living systems. Exascale computing will be needed to achieve those capabilities. Within the next decade, scientists expect to have the complete genome sequences of more than 10,000 bacteria, archaea, and other single-celled microbes. Exascale computing platforms will make it possible in principle to systematically reconstruct the metabolic networks of all sequenced microbes through automated comparative analysis, to reconstruct their regulatory networks by integrating a variety of data sources, and to combine these reconstructions into functional models of cellular states. Exascale computing will be critical to make this a routine class of computation, such that it can become part of the standard way we analyze genomes in the future.
9.4.3.5 Materials Science
Materials innovations are central to many of the technological advances responsible for our quality of life and prosperity. In fact, many of the disruptive technological advances since the turn of the last century (modern transportation, medical treatments and prosthetics, space exploration, global communication, computers and the electronics industry) used advances arising from every corner of the materials world: metals, ceramics, semiconductors, polymers, and novel combinations of these. Materials establish and support entire industries, and tens of millions of manufacturing jobs depend on the availability of these advanced materials at affordable costs. A quantifiable understanding of novel materials and their response is central as well to the technological challenges facing the country. Whether it is ceramics for high-efficiency automobiles, photovoltaics for next-generation solar power, or smart alloys for efficient building construction, the nation requires the development of advanced materials with superior properties that will drive the next generation of technologies. In the highly competitive global marketplace in which we find ourselves, minimizing time to solution and time to market is crucial. It is instructive to consider two workhorse techniques for materials modeling, hydrodynamics and molecular dynamics, and to examine the reasons why a simulation might fail to provide sufficiently useful information. Molecular dynamics simulations are characterized by a force field or potential, involving many adjustable parameters, which describes the interactions between atoms. No parameters are required to describe the response of the material; all of the constitutive response emerges naturally from the interaction potentials. Such calculations are currently limited in size to fractions of a cubic micron, simulated for tens of nanoseconds, even on the largest computers. Hydrodynamics, by comparison, involves many adjustable parameters describing both the interactions and the material response. However, there is no real size or time limit in the simulation; there is only a practical lower limit on resolution, as it makes no sense to model an atomically sized region of space using continuum equations.
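For readers less familiar with molecular dynamics, the toy sketch below evaluates pairwise Lennard-Jones forces, one of the simplest examples of the interaction potentials described above. The potential parameters and particle positions are placeholders, and a real MD code would add neighbor lists, periodic boundaries, and time integration, which is precisely where the size and time limits quoted above come from.

```python
import numpy as np

def lj_forces(positions, epsilon=1.0, sigma=1.0):
    """Pairwise Lennard-Jones forces for a small set of particles (O(N^2) toy)."""
    n = len(positions)
    forces = np.zeros_like(positions)
    for i in range(n):
        for j in range(i + 1, n):
            rij = positions[i] - positions[j]
            r2 = np.dot(rij, rij)
            inv_r2 = sigma**2 / r2
            inv_r6 = inv_r2**3
            # Force magnitude from dU/dr of U = 4*eps*((sigma/r)^12 - (sigma/r)^6)
            f_scalar = 24.0 * epsilon * (2.0 * inv_r6**2 - inv_r6) / r2
            forces[i] += f_scalar * rij
            forces[j] -= f_scalar * rij
    return forces

# Placeholder configuration: 64 particles scattered in a 5x5x5 box
positions = np.random.default_rng(1).random((64, 3)) * 5.0
f = lj_forces(positions)
```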
At a given level of computing, computational scientists using either method encounter two common barriers to success: (a) the largest (or most finely resolved) simulation possible is still too small (or too poorly resolved) to capture the relevant behavior of interest, or (b) the most complex, compute-intensive simulation that can be solved in a reasonable time is still too simple or approximate to adequately describe the physics of interest. In many cases both (a) and (b) are true, which is particularly damning, since it prevents the investigator from performing the traditional trade-off between these two constraints: very often, one makes simplifying approximations to enable a larger simulation, or investigates smaller systems in order to perform a more complicated calculation. On the other hand, investigating grain formation using molecular dynamics may not be possible, even for the simplest metals, on today’s computers. The availability of an exascale platform will move the


location of the constraints, allowing quite generally more detailed calculations of more complex materials. State-of-the-art calculations involving billions of atoms have been performed that demonstrate the ability to model macroscopic (i.e., continuum) materials behavior with an atomistic model that makes no assumptions about the cooperative response. Figure 9.5 shows a detailed view of a 9-billion-atom molecular dynamics simulation of a developing Kelvin-Helmholtz instability at the sheared interface between aluminum and copper. With the development of an exascale computer, it is possible that such a calculation (which was heroic on a petascale computer) could be performed on demand during a hydrodynamics calculation, determining, for example, the equation of state for a mixed region at precisely the temperature, pressure and composition that was required. By tabulating this information as it is generated, one can envision that such a simulation would teach itself as it runs, learning only those regions of this three-dimensional phase space that are needed.

Figure 9.5   Detail View of a 9-Billion-Atom Molecular Dynamics Simulation of a Kelvin-Helmholtz Instability

9.4.3.6 Nuclear Engineering
Recent studies have reviewed the status and basic science, challenges, opportunities, and research needs for advanced nuclear energy systems, with specific attention to the role of predictive modeling and simulation (M&S) in addressing the difficulties posed by the radioactive materials and harsh environments found in these systems:
• Computational M&S offers the opportunity to accelerate nuclear energy development by simulating complex systems to evaluate options and predict performance, thus narrowing the technology path and optimizing testing requirements.
• Today’s high-performance computational systems are capable of modeling complete reactor systems and related technologies; the availability of exascale systems will enable high-fidelity M&S that can further improve the performance of existing reactors and have a significant positive impact on both the design and the operation of future reactors.

Simulation has the potential for addressing the critical needs of advanced nuclear energy systems by providing the tools necessary for safety assessments, design activities, cost, and risk reduction. One can, for example, imagine virtual prototyping of reactor cores yielding data that leads to more accurate identification of design margins, allows early experimentation with novel design concepts, and ultimately significantly reduces plant certification timelines. In other areas, such as advanced fuel fabrication, atomistic fuel simulations could ultimately make it possible to target a small subset of promising candidate fuel types for further experimentation, greatly reducing the number of experiments to be performed. A simulation-based methodology is within reach with exascale computers. The scope of the M&S tools needed to support the design, analysis and engineering of next-generation nuclear energy systems is daunting:
1. Integrated 3D reactor core simulations with rigorous propagation of uncertainty;
2. Coupled thermal hydraulic and primary loop simulation;
3. Advanced fuel design and performance;
4. Fuel behavior engineering;
5. Advanced secondary loop and balance of plant engineering and analysis;
6. Advanced fuel cycle design;
7. Separations facility engineering optimization;
8. Repository design, including seismic, geological, chemical, and thermal modeling and simulation;
9. Overall nuclear energy systems model development suitable for alternative economic analysis.

Spent fuel reprocessing is very complicated: a large number of different materials and multiple pathways must be considered, waste streams must be treated, and improved coupling between computations and experiments must occur. Reprocessing occurs at high temperature and is in dire need of better multi-scale M&S. The opportunities for impact on reprocessing with exascale M&S abound. These include developing new separation agents, full-scale plant simulations using first principles, integrating multiple codes, and separations simulations. Empirical understanding does not lead to appropriate scale-up; that will instead require exascale computing. Some of the payoffs for exascale computation include reduced R&D cost and time, improved and accelerated design, process scale-up, reduced facility cost, the opportunity for major change, and waste form design. Many challenges confront viable and useful (predictive) M&S of fuel performance. These include the ability to reduce fuel development and qualification time, assess life cycle performance, address safety concerns, predict fuel rod behavior in design basis accidents (DBA), and predict current and advanced (e.g., transuranic) fuel behavior. Important effects and requirements to incorporate include material properties, swelling, microstructural phase change, thermal properties, crack formation and mechanical property change. High-fidelity modeling of fuel performance is inherently multiscale; for example, the effects of point defects and fission products must be considered. Exascale platform requirement drivers in fuel performance can be quantified. Opportunities for exascale M&S of existing and future advanced reactors include eliminating unrealistic assumptions that drive toward more conservative designs and thus higher installation cost, helping to achieve higher power efficiencies, reducing the learning curves required to reach those efficiencies, helping to reduce the required number of repositories, improving safety posture, optimizing the design of the power grid and the fuel cycle, and enabling better (more efficient) operations, including in-line monitoring and operator training. There are numerous issues confronting advanced reactor M&S today. The core is a coupled-physics problem (not currently handled very well) and the full system needs to be analyzed in one tool. Current reactor designs are excessively conservative.
9.4.3.7 Other Disciplines
Other frequently mentioned disciplines that will be really impacted by exascale are:
• Astrophysics
• Fusion Energy
• National Security

Users should consult the USDOE report “The Opportunities and Challenges of Exascale Computing” for further information.
9.4.4 Challenges in Going to the Exascale
Creating an exascale computer capable of effectively running the applications just described will require significant R&D breakthroughs. The previous section laid out the case for the wide range of scientific and technical advances that could be made with an exaflop computer. This section discusses the challenges that must be overcome to make that three-order-of-magnitude jump in technology. In this type of discussion, it is often far too easy to talk about that jump as some quantitative step in an evolutionary process, when in fact the jump implies significant qualitative changes in the way solutions must be approached. Consider Table 9.1, which illustrates a jump of three orders of


magnitude. The analogy to computing challenges is not quite the same, because we do not have to explore totally different technologies to make the leap of three orders of magnitude. However, just as we would not think of asking a marathon runner to explore the solar system, we cannot use current technology to produce an exaflop system. Below we highlight the important steps necessary to take this giant step; users may consult the Report on Exascale Computing280 for additional information.

Table 9.1   Three Orders of Magnitude Jump

Technology         Quantitative Rate    Qualitative Change
Marathon Runner    10 mph               Explore a town
Car                100 mph              Explore a country
Jet                1,000 mph            Explore a world
Spacecraft         10,000 mph           Explore the solar system

9.4.4.1 The Hardware Challenges
The architectural challenges for reaching exascale are dominated by power, memory, interconnection networks, and resilience. Table 9.2 compares current HPC designs with potential exascale designs from the DOE281. The baseline we need is a factor-of-500 change in peak system performance. The difference in factor changes for the various components shows where simple scaling of systems (e.g., buying 500 of today's 2 Pf/s systems) will be inadequate.

Table 9.2   Potential Exascale Computer Design for 2018 and its Relationship to Current HPC Designs (DOE)

280 The Opportunities and Challenges of Exascale Computing, Summary Report of the Advanced Scientific Computing Advisory Committee (ASCAC) Subcommittee, Fall 2010, USDOE.
281 See previous.


Take, for example, the power line in the table. While the peak speed goes up by a factor of 500, the power cost cannot go up by more than a factor of 3. That means that the power solution for an exaflop system has to be over 150 times more efficient than current technology. That is a huge challenge; the short calculation after the following list makes it explicit. Looking through the other entries, the table clearly echoes the sentiments of the IAA and highlights key features that must be addressed in hardware or downstream in software. Other potential challenges, each of which needs major R&D progress for exaflop hardware, include:
• Power
• System memory
• Data movement
• System resiliency
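The power constraint amounts to a one-line calculation; the factors of 500 (peak performance growth) and 3 (allowable power growth) are those quoted in the text above.

```python
# Required improvement in energy efficiency (flops per watt) for exascale:
peak_speedup = 500        # factor increase in peak system performance
power_budget_growth = 3   # maximum allowable increase in total system power
efficiency_gain = peak_speedup / power_budget_growth
print(f"Energy efficiency must improve by roughly {efficiency_gain:.0f}x")  # ~167x
```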

9.4.4.2 The Applied Mathematics Challenges
The applied mathematics component of an exascale program should include attention to activities with time horizons ranging from medium term to very long term, where both ends of the time scale are essential. The description “medium-term” is deliberate, because experience in adapting to new computational modalities shows that short-term, one-off strategies are likely to be wasteful. Even though much remains unknown about the details of exascale systems, a clear medium-term priority is the definition and implementation of algorithms that are scalable at very large levels of parallelism (such as on million-core machines) and that remain sufficiently fast under different hardware decisions about bandwidth and latency. Scalability should be modeled and analyzed mathematically, using abstractions that represent key architectural ingredients. Simulations and experiments that indicate the effects of hardware and software perturbations on algorithmic efficiency can then guide the definition of methods that retain scalability under a variety of hardware scenarios. In this spirit, the strategies for applied mathematics in exascale science will require sustained support over time for people-intensive activities, early identification of the hardest (and least straightforward) research problems, and built-in flexibility to pursue unexpected and promising new directions as they arise. Some other points are important but not discussed here.
9.4.4.3 Mathematical Modeling
It is natural for those developing mathematical models of practical problems to limit themselves to formulations that can be solved numerically using currently available methods. Although essential when the problem needs to be solved in the short term, an ab initio focus on feasibility can create a too-rigid environment in which non-standard or “blue-sky” formulations are either never thought about or else summarily rejected. For example, a problem formulation that represents many real-world problems, yet tends to be avoided because of its known intractability, is constrained nonlinear optimization with a mixture of continuous, discrete, and categorical variables. But the prospect of massive increases in computational power means that modeling ideas previously dismissed as impossible or impractical may well become realistic, and should be carefully examined and analyzed. Creative rethinking of mathematical models is an essential strategy to address the challenges of exascale science. The highly desired “transformational” changes flowing from exascale computing are most likely to come from new formulations that change the way we think about problems, rather than from applying more resources to an existing formulation to obtain a more accurate solution or to solve a larger problem. Mathematical models are inherently an approximation of reality, and an exascale initiative provides an opportunity to loosen the grip of, or even remove, computationally imposed simplifications. The major challenge is to devise models that capture the important details of physical and engineered systems as they really are. This will almost certainly generate much harder sub-problems and/or much more data, but the gains are likely to be eminently worthwhile.
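Before turning to specific algorithms, one elementary example of the kind of scalability modeling called for in the applied mathematics discussion above is Amdahl's law. The serial fractions in the sketch below are assumed values, chosen only to show how small the serial component must be before million-core concurrency pays off.

```python
# Amdahl's law: speedup(P) = 1 / (s + (1 - s)/P) for serial fraction s on P cores.
def amdahl_speedup(serial_fraction, cores):
    return 1.0 / (serial_fraction + (1.0 - serial_fraction) / cores)

for s in (1e-3, 1e-5, 1e-7):   # assumed serial fractions, purely illustrative
    print(f"s = {s:g}: speedup on 1e6 cores = {amdahl_speedup(s, 1_000_000):.3e}")
```

Even a serial fraction of one part in a thousand caps the speedup near 1,000 on a million cores, which is why the text insists that scalability be analyzed, not assumed.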


9.4.4.4 Numerical Algorithms
The need for scalable algorithms in an exascale initiative has already been stressed. Another essential feature, highlighted in a 2009 talk by Kathy Yelick called “Ten Ways to Waste a Parallel Computer”, is a “back to basics” approach to reformulation. Without careful analysis of both new models and new numerical methods, there is the risk of significant inaccuracy or large computational overhead in unexpected parts of the overall solution process, as illustrated in the following two examples related to numerical methods for partial differential equations:
1. All indications are that memory will become the rate-limiting factor along the path to exascale, and investments should accordingly be made in designing algorithms with reduced memory requirements. Examples where this work is appropriate include:
   i. Algorithmically scalable matrix-free methods (e.g., multigrid) for sparse systems of equations, where “algorithmically scalable” means that the total resources needed to solve the problem (flops plus memory) are proportional to the resources needed to evaluate the associated operator (a minimal matrix-free sketch appears later in this subsection);
   ii. High-order methods that perform more computation to obtain greater accuracy for each computational degree of freedom;
   iii. Adaptive models/methods designed to use the smallest possible number of degrees of freedom to obtain the needed level of accuracy.

2. Many calculations related to DOE missions involve models that depend on both space and time. In current methods, obtaining better spatial resolution typically requires a comparable reduction in the time step. A frequent argument for exascale science is that it will allow much finer spatial resolution in numerous application domains, with (for example) meshes reduced in size by a factor of ten. Unfortunately, simply reducing the mesh spacing by a factor of ten could lead to a ten-fold increase in the time to solution, even with perfect weak scaling. Several strategies, all in the spirit of rethinking, should be explored to avoid this inefficiency. For example, models can be made more implicit to avoid restrictive time-step conditions arising from stiff processes that rapidly relax to equilibrium (e.g., in the context of low-Mach-number fluid flows). A further strategy is aggressive use of sub-cycling in time for processes that are fast but either are localized in physical space or involve only a small subset of the variables in state space. A motivating example here is advection in the jet stream in atmospheric modeling. Approaches of this flavor across the spectrum of numerical methods will almost certainly lead to increased algorithmic complexity, in addition to the daunting software-related challenges discussed.
The substantially greater work needed to devise exascale numerical methods and software leads us to observe that, for decades, there has been, roughly speaking, a dichotomy in the wish list for the mathematical software used to solve scientific and engineering problems. On one hand, many DOE scientists have neither the time nor the inclination to become experts in numerical methods and software techniques, preferring to leave software development to mathematicians and computer scientists. On the other hand, some scientists and engineers want to become deeply involved in producing domain-specific methods and software to attain the highest possible efficiency for their particular problem. An exascale science program needs to address the needs of both these groups. For the first, “professional” mathematical software and libraries (meaning software developed by mathematicians and computer scientists for relatively generic problems such as solving linear systems or eigenvalue problems) should be developed for increasingly broad problem categories as we move toward exascale. In this way, domain scientists will be able to use state-of-the-art software components that can be shared across multiple application domains. Since writing software is universally recognized


to be time consuming and error-prone, scientists and engineers will benefit from the availability of software that they can use off the shelf while experimenting with domain-specific challenges, rather than writing their own sparse matrix package. For the second group, specific scientific case studies should be identified that require significant involvement of domain scientists, mathematicians, and computer scientists in end-to-end software development.
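Returning to the matrix-free point in item 1(i) above, the minimal sketch below applies a one-dimensional Laplacian stencil without ever storing a matrix; the grid size and boundary treatment are arbitrary, and the point is only that memory cost stays proportional to the solution vector, which is what "algorithmically scalable" asks for.

```python
import numpy as np

def laplacian_apply(u, h):
    """Matrix-free application of the 1D Laplacian stencil (approximating -u'').

    Assumes homogeneous Dirichlet boundaries; no matrix entries are stored,
    so the memory cost is just that of the solution vector itself.
    """
    Au = np.empty_like(u)
    Au[1:-1] = (2.0 * u[1:-1] - u[:-2] - u[2:]) / h**2
    Au[0] = (2.0 * u[0] - u[1]) / h**2
    Au[-1] = (2.0 * u[-1] - u[-2]) / h**2
    return Au

n = 1_000_000
h = 1.0 / (n + 1)
u = np.linspace(0.0, 1.0, n)
Au = laplacian_apply(u, h)   # operator applied with O(n) memory, no matrix stored
```

Such a routine could be handed directly to an iterative Krylov solver of the kind sketched earlier in this chapter, combining the matrix-free and scalable-solver ideas.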

Other topics noted in this area include:
• Mathematics for massive data
  ➢ Machine learning
  ➢ Compressive sampling
• Symbolic computing
9.4.4.5 The Algorithmic Challenges
Advancing science in key areas requires the development of next-generation physical models to satisfy the accuracy and fidelity needs of targeted simulations. The impact of these simulation fidelity needs on the requirements for computational science is twofold:
• First, more complex physical models must be developed to account for more aspects of the physical phenomena being modeled.
• Second, for the physical models being used, increases in resolution for key system variables, such as numbers of spatial zones, time steps, or chemical species, are needed to improve simulation accuracy, which in turn places higher demands on computational hardware and software.

Application models represent the functional requirements that drive the need for certain numerical algorithms and software implementations. Science priorities lead to science models, and models are implemented in the form of algorithms. Algorithm selection is based on various criteria, such as appropriateness, accuracy, verification, convergence, performance, parallelism and scalability. Moving forward to exascale will put heavier demands on algorithms in at least two areas:
• the need for increasing amounts of data locality in order to perform computations efficiently, and
• the need to obtain much higher factors of fine-grained parallelism as high-end systems support increasing numbers of compute threads.

As a consequence, parallel algorithms must adapt to this environment, and new algorithms and implementations must be developed to extract the computational capabilities of the new hardware. Significant new model development, algorithm redesign, and science application code reimplementation, supported by exascale-appropriate programming models, will be required to exploit efficiently the power of exascale architectures. The transition from current sub-petascale and petascale computing to exascale computing will be at least as disruptive as the transition from vector to parallel computing in the 1990s. Uncertainty quantification will permeate the exascale science workload. The demand for predictive science results will drive the development of improved approaches for establishing levels of confidence in computational predictions. Both statistical techniques involving large ensemble calculations and other statistical analysis tools will have significantly different dynamic resource allocation requirements than in the past, and the significant code redesign required for the exascale will present an opportunity to embed uncertainty quantification techniques in exascale science applications (a minimal ensemble-sampling sketch follows the list below). Some other points are:
• New multicore-friendly and multicore-aware algorithms
• Adaptive response to load imbalance
• Multiple precision algorithms/software
• Communication avoiding
• Fast implicit solvers
• Auto-tuning
• Scheduling and memory management for heterogeneity and scale
• Fault tolerance and robustness for large-scale systems
• Building energy efficiency into algorithm foundations
• Sensitivity analysis
• Multiscale/multi-physics modeling

9.4.4.6 Computer Science Challenges
The coming transition in computer architectures as peak capability approaches the exascale offers both challenges and opportunities282. The challenges involve a paradigm shift in programming methodologies. Existing technologies for writing parallel scientific applications have sustained HPC application software development for the past decade and have been successful for Petascale computing, but were architected for coarse-grained concurrency largely dominated by bulk synchronous algorithms. Future hardware constraints and growth in explicit on-chip parallelism will likely require a mass migration to new algorithms and software architecture that is as broad and disruptive as the migration from vector to parallel computing systems that occurred 15 years ago. The applications and algorithms will need to rely increasingly on fine-grained parallelism, strong scaling, and fault resilience. Addressing these challenges opens up a renewed opportunity to introduce a higher level of software engineering into current fusion application subsystems that will enhance the modularity, portability, and performance of codes while extending their capabilities to new levels. At the same time, past sound investments must be protected, and a migration path from current to future environments must be elaborated. Some other themes are:

• Programming Models
• I/O
• Getting There from Here
• Tools
• Fault Tolerance

9.4.4.7 Educational Challenges
Major challenges in exascale science include the building of understanding and awareness among groups with high prestige in both academia and industry, and the dearth of highly competent young scientists in this field, two issues that are not entirely unrelated. Many of the reasons for these problems are reasonably well understood, but not easily dealt with. Application scientists who focus primarily on building computational tools are sometimes regarded by their scientific community as not being "real" scientists. This phenomenon is particularly noticeable in both physics and chemistry, reflecting in part the penetration of "community codes". From the opposite perspective, high-level software designers and programmers may not welcome or appreciate the contributions made by scientific disciplines to building state-of-the-art computational tools. On the bright side, interest in computational science and engineering worldwide has measurably increased during the past 15 years. Almost no universities, even those with faculty working on computational science and engineering, have, or are likely to develop, a curriculum that focuses on issues associated with exascale science. In addition, as our subcommittee has noted already, many of the issues in exascale science are not yet understood, which means that a straightforward program of training in the usual sense is impossible. Exascale hardware and its features will keep changing, so that training people too early to think about specific hardware configurations is a bad idea. However, it is important to start soon to lay the foundations for future thinking about exascale science. To be successful, an exascale science education and training program needs to be devised and managed with creative flair, not business as usual283.

282 The Opportunities and Challenges of Exascale Computing, Summary Report of the Advanced Scientific Computing Advisory Committee (ASCAC) Subcommittee, Fall 2010, USDOE.

283 The Opportunities and Challenges of Exascale Computing, Summary Report of the Advanced Scientific Computing Advisory Committee (ASCAC) Subcommittee, Fall 2010, USDOE.


10 Artificial Intelligence in CFD
10.1 Background
Artificial Intelligence (AI) is the broadest way to think about advanced, computer intelligence. In 1956 at the Dartmouth Artificial Intelligence Conference, the technology was described as such: "Every aspect of learning or any other feature of intelligence can in principle be so precisely described that a machine can be made to simulate it." AI can refer to anything from a computer program playing a game of chess, to a voice-recognition system like Amazon's Alexa interpreting and responding to speech. IBM's Deep Blue, which beat chess grand master Garry Kasparov in 1997, and Google DeepMind's AlphaGo are examples of AI. According to the HackerEarth Blog, AI can be classified into the following (see Figure 10.1):

• Machine Learning
• Neural Networks
• Deep Learning

Figure 10.1 Scope of Artificial Intelligence (Courtesy of HackerEarth Blog)

10.2 Machine Learning
Before turning to machine learning in CFD, it is best to review briefly what machine learning itself is. Machine learning is a type of Artificial Intelligence (AI) that provides computers with the ability to learn without being explicitly programmed. Machine learning focuses on the development of computer programs that can change when exposed to new data. In the past decade, machine learning has given us self-driving cars, practical speech recognition, effective web search, and a vastly improved understanding of the human genome. Machine learning is so pervasive today that you probably use it dozens of times a day without knowing it. The process of machine learning is similar to that of data mining. Both systems search through data to look for patterns. However, instead of extracting data for human comprehension, as is the case in data mining applications, machine learning uses that data to detect patterns and adjust program actions accordingly. Machine learning algorithms are often categorized as being supervised or unsupervised. Supervised algorithms can apply what has been learned in the past to new data. Unsupervised algorithms can draw inferences from datasets. Facebook's News Feed uses machine learning to personalize each member's feed. If a member frequently stops scrolling in order to read or "like" a particular friend's posts, the News Feed will start to show more of that friend's activity earlier in the feed. Behind the scenes, the software is simply using statistical analysis and predictive analytics to identify patterns in the user's data and uses those patterns to populate the News Feed; as new behavior is observed, it is included in the data set and the News Feed adjusts accordingly. Google and Amazon are other heavy users of machine learning.
10.2.1 Difference Between Artificial Intelligence and Machine Learning
Artificial Intelligence (AI) is a computer program that does something smart. It can be a pile of statements or a complex statistical model. Usually, when a computer program designed by AI researchers actually succeeds at something, like winning at chess, many people say it is "not really intelligent", because the algorithm's internals are well understood. So you could say that true AI is whatever computers cannot do yet. Machine learning is a subset of AI. In short, machine learning is a science that involves the development of self-learning algorithms. These algorithms are generic in nature, so they can be applied to various domain-related problems. Machine learning uses statistics (mostly inferential statistics) to develop self-learning algorithms. Artificial Intelligence is the science of developing a system or software to mimic humans in how they respond and behave in a given circumstance. As a field with extremely broad scope, AI has divided its goal into multiple chunks, and each chunk has since become a separate field of study devoted to its own problem. The "learning" part of machine learning means that ML algorithms attempt to optimize along a certain dimension; i.e., they usually try to minimize error or maximize the likelihood of their predictions being true. How does one minimize error? One way is to build a framework that multiplies inputs in order to make guesses as to the inputs' nature. Different outputs/guesses are the product of the inputs and the algorithm. Usually, the initial guesses are quite wrong, and if you are lucky enough to have ground-truth labels pertaining to the input, you can measure how wrong your guesses are by contrasting them with the truth, and then use that error to modify your algorithm. That is what Artificial Neural Networks (ANN) do. They keep on measuring the error and modifying their parameters until they cannot achieve any less error. They are, in short, an optimization algorithm. If you tune them right, they minimize their error by guessing and guessing and guessing again.
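The error-minimization loop described above can be made concrete with a short sketch. The snippet below is a minimal, hypothetical illustration (not drawn from any reference cited in this chapter): it fits the parameters of a single linear "neuron" by gradient descent on the mean squared error, repeatedly measuring the error against ground-truth labels and nudging the parameters downhill. All data and variable names are made up for the example.

```python
import numpy as np

# Synthetic ground-truth data: y = 2*x + 1 plus a little noise (hypothetical)
rng = np.random.default_rng(0)
x = rng.uniform(-1.0, 1.0, size=200)
y_true = 2.0 * x + 1.0 + 0.05 * rng.normal(size=x.size)

# Parameters of a single linear "neuron": y_hat = w*x + b
w, b = 0.0, 0.0
learning_rate = 0.1

for epoch in range(500):
    y_hat = w * x + b                 # forward pass (the "guess")
    error = y_hat - y_true            # how wrong the guess is
    mse = np.mean(error ** 2)         # scalar measure of the error
    # Gradients of the MSE with respect to w and b
    grad_w = 2.0 * np.mean(error * x)
    grad_b = 2.0 * np.mean(error)
    # Modify the parameters to reduce the error
    w -= learning_rate * grad_w
    b -= learning_rate * grad_b

print(f"fitted w = {w:.3f}, b = {b:.3f}, final MSE = {mse:.4f}")
```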

10.3 Deep Learning

Figure 10.2 Schematics of AI, Machine Learning and Deep Learning

Artificial Neural Networks (ANN) are inspired by our understanding of the biology of our brains, all those interconnections between the neurons284.

But, unlike a biological brain where any neuron can connect to any other neuron within a certain physical distance, these artificial neural networks have discrete layers, connections, and directions of data propagation. You might, for example, take an image and chop it up into a bunch of tiles that are fed into the first layer of the neural network. In the first layer, individual neurons perform their task and then pass the data to a second layer. The second layer of neurons does its task, and so on, until the final layer is reached and the final output is produced. Each neuron assigns a weighting to its input, reflecting how correct or incorrect it is relative to the task being performed. The final output is then determined by the total of those weightings. Consider a stop sign as an example. Attributes of a stop sign image are chopped up and "examined" by the neurons: its octagonal shape, its fire-engine red color, its distinctive letters, its traffic-sign size, and its motion or lack thereof. The neural network's task is to conclude whether this is a stop sign or not. It comes up with a "probability vector", really a highly educated guess, based on the weightings. In our example the system might be 86% confident the image is a stop sign, 7% confident it is a speed limit sign, 5% confident it is a kite stuck in a tree, and so on, and the network architecture then tells the neural network whether it is right or not. In short, Deep Learning is a technique for implementing Machine Learning. Deep Learning has enabled many practical applications of Machine Learning, and by extension the overall field of AI, as perceived in Figure 10.2. Deep Learning breaks down tasks in ways that make all kinds of machine assists seem possible, even likely. Driverless cars, better preventive healthcare, even better movie recommendations are all here today or on the horizon. Today, image recognition by machines trained via deep learning is in some scenarios better than that of humans, and the applications range from identifying cats to spotting indicators of cancer in blood and tumors in MRI scans. Google's AlphaGo learned the game and trained for its Go match, tuning its neural network, by playing against itself over and over and over.
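As a concrete illustration of the "probability vector" mentioned above, the short sketch below, a hypothetical example rather than code from any cited reference, passes a vector of raw class scores through a softmax function to obtain confidences similar to the 86%/7%/5% split described for the stop-sign image. The raw scores are invented for the example.

```python
import numpy as np

def softmax(scores):
    """Convert raw class scores into a probability vector that sums to 1."""
    shifted = scores - np.max(scores)   # subtract the max for numerical stability
    exp = np.exp(shifted)
    return exp / exp.sum()

# Hypothetical raw scores from the final layer for three candidate classes
labels = ["stop sign", "speed limit sign", "kite in a tree"]
raw_scores = np.array([4.0, 1.5, 1.2])

probabilities = softmax(raw_scores)
for label, p in zip(labels, probabilities):
    print(f"{label}: {100 * p:.1f}% confident")
```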

10.4 Types of Problems and Tasks

Machine learning tasks are typically classified into three broad categories, depending on the nature of the learning "signal" or "feedback" available to a learning system. These are:
10.4.1 Supervised Learning
How it works: this algorithm consists of a target/outcome variable (or dependent variable) which is to be predicted from a given set of predictors (independent variables). Using this set of variables, we generate a function that maps inputs to desired outputs. The training process continues until the model achieves a desired level of accuracy on the training data. Examples of supervised learning: Regression, Decision Tree, Random Forest, KNN, Logistic Regression, etc.285
10.4.2 Unsupervised Learning
In this algorithm, we do not have any target or outcome variable to predict/estimate. It is used for clustering a population into different groups, which is widely used for segmenting customers into different groups for specific interventions. Examples of unsupervised learning: Apriori algorithm, K-means.
10.4.3 Reinforcement Learning
Using this algorithm, the machine is trained to make specific decisions. It works this way: the machine is exposed to an environment where it trains itself continually using trial and error. The machine learns from past experience and tries to capture the best possible knowledge to make accurate business decisions. Example of reinforcement learning: Markov Decision Process286.

284 Michael Copeland, "What's the Difference Between Artificial Intelligence, Machine Learning, and Deep Learning?", July 2010.
285 Sunil Ray, "Essentials of Machine Learning Algorithms (with Python and R codes)", August 2015.
286 Same as previous.
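To make the supervised/unsupervised distinction concrete, the following sketch, a hypothetical example that assumes the scikit-learn library is available, fits a supervised regressor to labeled data and then clusters the same inputs without labels using K-means. The data are synthetic and exist only for illustration.

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans

rng = np.random.default_rng(1)

# Supervised learning: inputs X with known targets y
X = rng.uniform(0.0, 10.0, size=(100, 1))
y = 3.0 * X[:, 0] + 2.0 + rng.normal(scale=0.5, size=100)

model = LinearRegression().fit(X, y)          # learn the mapping X -> y
print("learned slope/intercept:", model.coef_[0], model.intercept_)

# Unsupervised learning: the same inputs, but no targets are given
clusters = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
print("cluster labels of the first five samples:", clusters[:5])
```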


10.5 List of Common Machine Learning Algorithms
Here is a list of commonly used machine learning algorithms, which can be applied to almost any data problem; the first four are described briefly below.

• Linear Regression
• Logistic Regression
• Decision Tree
• Artificial Neural Networks (ANNs)
• Support Vector Machine (SVM)
• Naive Bayes
• K-Nearest Neighbors (KNN)
• K-Means
• Random Forest
• Dimensionality Reduction Algorithms
• Gradient Boost

10.5.1 Linear Regression
It is used to estimate real values (cost of houses, number of calls, total sales, etc.) based on continuous variable(s). Here, we establish a relationship between independent and dependent variables by fitting a best line. This best fit line is known as the regression line and is represented by a linear equation Y = a*X + b. The best way to understand linear regression is to relive this experience of childhood. Let us say you ask a child in fifth grade to arrange the people in his class by increasing order of weight, without asking them their weights. What do you think the child will do? He or she would likely look (visually analyze) at the height and build of people and arrange them using a combination of these visible parameters. This is linear regression in real life! The child has actually figured out that height and build are correlated to weight by a relationship, which looks like the equation above. In this equation, Y is the dependent variable, X is the independent variable, a is the slope, and b is the intercept. The coefficients a and b are derived by minimizing the sum of squared distances between the data points and the regression line. Consider the example in Figure 10.3: the best fit line has the linear equation y = 0.2811x + 13.9. Using this equation, we can find the weight, knowing the height of a person. Linear regression is of mainly two types: Simple Linear Regression and Multiple Linear Regression. Simple Linear Regression is characterized by one independent variable, and Multiple Linear Regression (as the name suggests) is characterized by multiple (more than one) independent variables. While finding the best fit line, you can also fit a polynomial or curvilinear relationship, and these are known as polynomial or curvilinear regression287.

Figure 10.3 Linear Regression

10.5.2 Logistic Regression
Do not be confused by its name! It is a classification, not a regression, algorithm. It is used to estimate discrete values (binary values like 0/1, yes/no, true/false) based on a given set of independent variable(s). In simple words, it predicts the probability of occurrence of an event by fitting data to a logit function; hence, it is also known as logit regression. Since it predicts a probability, its output values lie between 0 and 1 (as expected). Again, let us try and understand this through a simple example. Suppose your friend gives you a puzzle to solve. There are only two outcome scenarios: either you solve it or you do not. Now imagine that you are being given a wide range of puzzles and quizzes in an attempt to understand which subjects you are good at. The outcome of this study would be something like this: if you are given a trigonometry-based tenth grade problem, you are 70% likely to solve it; on the other hand, if it is a fifth grade history question, the probability of getting an answer is only 30%. This is what Logistic Regression provides you. Coming to the math, the log odds of the outcome are modeled as a linear combination of the predictor variables: odds = p/(1-p) = (probability of event occurrence)/(probability of no event occurrence), and ln(odds) = ln(p/(1-p)) = logit(p). Above, p is the probability of presence of the characteristic of interest. It chooses parameters that maximize the likelihood of observing the sample values, rather than those that minimize the sum of squared errors (as in ordinary regression). Now, you may ask, why take a log? For the sake of simplicity, let us just say that this is one of the best mathematical ways to replicate a step function. One could go into more detail, but that would defeat the purpose of this discussion.
10.5.3 Decision Tree
This is a favorite algorithm, and one used quite frequently. It is a type of supervised learning algorithm that is mostly used for classification problems288. Surprisingly, it works for both categorical and continuous dependent variables. In this algorithm, we split the population into two or more homogeneous sets. This is done based on the most significant attributes/independent variables, so as to make the groups as distinct as possible. In Figure 10.4, you can see that a population is classified into four different groups based on multiple attributes to identify 'if they will play or not'. To split the population into different heterogeneous groups, it uses various techniques (see Figure 10.4).

Figure 10.4 Decision Tree
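The regression-line idea from Section 10.5.1 can be reproduced with a few lines of code. This is a minimal sketch with made-up height and weight data (the coefficients 0.2811 and 13.9 quoted above came from the original example's data, which are not reproduced here); it simply shows how the slope a and intercept b of Y = a*X + b are obtained by least squares.

```python
import numpy as np

# Hypothetical height (cm) and weight (kg) measurements
height = np.array([152, 158, 163, 170, 175, 180, 185])
weight = np.array([51, 54, 59, 64, 66, 72, 78])

# Least-squares fit of weight = a * height + b
a, b = np.polyfit(height, weight, deg=1)
print(f"regression line: weight = {a:.4f} * height + {b:.2f}")

# Predict the weight of a person 168 cm tall
print("predicted weight at 168 cm:", a * 168 + b)
```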

287 Sunil Ray, "Essentials of Machine Learning Algorithms (with Python and R codes)", August 2015.
288 Sunil Ray, "Essentials of Machine Learning Algorithms (with Python and R codes)", August 2015.


10.5.4 Artificial Neural Networks (ANNs)
An ANN is a computational model used in machine learning, computer science and other research disciplines, which is based on a large collection of connected simple units called artificial neurons, loosely analogous to axons in a biological brain. Connections between neurons carry an activation signal of varying strength. If the combined incoming signals are strong enough, the neuron becomes activated and the signal travels to other neurons connected to it. Such systems can be trained from examples, rather than explicitly programmed, and excel in areas where the solution or feature detection is difficult to express in a traditional computer program. Like other machine learning methods, neural networks have been used to solve a wide variety of tasks, like computer vision and speech recognition, that are difficult to solve using ordinary rule-based programming. Typically, neurons are connected in layers, and signals travel from the first (input) to the last (output) layer. Modern neural network projects typically have a few thousand to a few million neural units and millions of connections; their computing power is similar to a worm brain, several orders of magnitude simpler than a human brain. The signals and state of artificial neurons are real numbers, typically between 0 and 1. There may be a threshold function or limiting function on each connection and on the unit itself, such that the signal must surpass the limit before propagating. Back propagation is the use of forward stimulation to modify connection weights, and is sometimes done to train the network using known correct outputs. However, the success is unpredictable: after training, some systems are good at solving problems while others are not. Training typically requires several thousand cycles of interaction (see Figure 10.5). The goal of the neural network is to solve problems in the same way that a human would, although several neural network categories are more abstract. New brain research often stimulates new patterns in neural networks. One new approach is the use of connections which span further to connect processing layers rather than adjacent neurons. Other research being explored, with the different types of signal over time that axons propagate, such as deep learning, interpolates greater complexity than a set of Boolean variables being simply on or off. Newer types of network are more free-flowing in terms of stimulation and inhibition, with connections interacting in more chaotic and complex ways. Dynamic neural networks are the most advanced, in that they can dynamically, based on rules, form new connections and even new neural units while disabling others. Historically, the use of neural network models marked a directional shift in the late 1980s from high-level (symbolic) artificial intelligence, characterized by expert systems with knowledge embodied in if-then rules, to low-level (sub-symbolic) machine learning, characterized by knowledge embodied in the parameters of a cognitive model with some dynamical system. A simple example provided below demonstrates this.

Figure 10.5 Artificial Neural Network (ANN)

10.5.4.1 Case Study - Prediction & Comparison of the Maximal Wall Shear Stress (MWSS) for Carotid Artery Bifurcation
Steady state simulations for 1886 geometries were undertaken and MWSS values were calculated for each of them. This dataset was used for training and testing the following data mining algorithms: k-nearest neighbors, linear regression, neural network (multilayer perceptron), random forest and support vector machine. The results are compared using the relative Root Mean Square Error (RMSE):

RMSE = [ Σ_{i=1}^{n} (f_i − f̂_i)² ] / [ Σ_{i=1}^{n} (f_i − f̄)² ] ,   0 ≤ RMSE ≤ 1
Eq. 10.1
where f_i is the desired value (target), f̂_i is the predicted value (predicted using the data mining algorithm), and f̄ is the average value of MWSS over all 1886 samples.

Visualization of the global importance of features used for modeling MWSS: the horizontal axis of each diagram denotes the values of a particular feature and the vertical axis denotes the respective average contribution value for that particular feature value. The application of the model explanation methodology quantitatively describes how much features and their individual values, on average, influence the target prediction values of the model (see Table 10.1 and Figure 10.6).

Table 10.1 Results of Different Methods

Model                     RMSE
K-Nearest Neighbors       0.760
Linear Regression         0.748
Neural Network            0.140
Random Forest             1.127
Support Vector Machine    0.612

Figure 10.6 Maximal Wall Shear Stress (MWSS) Value for Carotid Artery Bifurcation
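A direct transcription of Eq. 10.1 is shown below as a short, hypothetical sketch: it computes the relative RMSE of a set of predictions against targets, normalized by the spread of the targets about their mean, so a value near 0 indicates a good surrogate and a value near 1 indicates a model no better than predicting the mean. The sample values are invented.

```python
import numpy as np

def relative_rmse(targets, predictions):
    """Relative RMSE of Eq. 10.1: sum of squared errors normalized by the
    sum of squared deviations of the targets from their mean."""
    targets = np.asarray(targets, dtype=float)
    predictions = np.asarray(predictions, dtype=float)
    numerator = np.sum((targets - predictions) ** 2)
    denominator = np.sum((targets - targets.mean()) ** 2)
    return numerator / denominator

# Hypothetical MWSS targets and surrogate-model predictions
f_true = np.array([2.1, 2.4, 3.0, 2.8, 3.5])
f_pred = np.array([2.0, 2.5, 2.9, 2.9, 3.4])
print("relative RMSE:", relative_rmse(f_true, f_pred))
```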


10.6 Machine Learning in Fluid Dynamics
Time-varying fluid flows are ubiquitous in modern engineering and in the life sciences. Particularly challenging is the characterization of unsteady aerodynamic forces and moments, as they play critical roles in, for instance, biological propulsion and bio-inspired engineering design principles. It is observed that birds, bats, insects, and fish routinely harness unsteady fluid phenomena to improve their propulsive efficiency, maximize thrust and lift, and increase maneuverability. Such observations are highly suggestive, leading to conjectures about the existence of low-dimensional structures in fluid flows. Machine learning aims to capitalize on such dominant patterns of spatial-temporal activity. When integrated with more traditional first-principles simulation, reduced-order models can be developed to accurately quantify the evolution of fluid flows.
10.6.1 Motivation and Objectives
Flow control has been a fundamental concept in fluid mechanics research in this century. We develop flow modeling and optimization techniques using biologically inspired algorithms such as Artificial Neural Networks (ANN) and evolution strategies. The applications presented herein encompass a variety of problems such as cylinder drag minimization, neural net modeling of the near wall structures, enhanced jet mixing, and parameter optimization in turbine blade film cooling. The unifying concept is the utilization of automated processes for the solution of these problems, devised from machine learning algorithms. Constructing low-order models of the near-wall dynamics is a challenging problem that, when solved, could lead to drastically improved designs. We envision neural network approaches as an effective way of developing such models and incorporating them in feedback control algorithms. We present some preliminary results from the application of ANNs as a method to construct low-order models describing the near-wall dynamics in turbulent flows. Neural networks are viewed as a general procedure of model formation encompassing schemes such as the Proper Orthogonal Decomposition (POD).
10.6.2 Design and Optimization Issue
Another key issue in the effort to reduce time to market of new engineering designs is the optimization of the design parameters in an efficient manner. The design cycle usually involves multi-objective and multi-disciplinary optimization problems, requiring the iterative solution of empirical formulas, the appropriate integration of numerical simulations, and the incorporation of physical understanding of the various aspects of the problem. At the same time, the optimization cycle of the physical problem must take into consideration financial and manufacturing constraints. In flow related problems, this optimization cycle has benefited from advances in optimization theory which usually aim at tackling the most costly aspects of the optimization problem, such as the solution of the Navier-Stokes equations. Powerful techniques such as the adjoint procedure have been implemented successfully in the design cycle of aircraft. However, such optimization strategies are usually based on the efficient calculation of gradients of functions relating the quantity to be optimized to the parameters of the problem.
Such gradients are not always readily available, as the optimization cycle often involves empirical formulas and cost functions that are difficult to express analytically in terms of the optimization problem. Moreover, gradient-based algorithms usually converge to local extrema; therefore, the result strongly depends on the initial selection of parameters. Evolution strategies [Rechenberg]289 are optimization techniques that avoid the problems associated with the use of gradients, as they require only the calculation of the cost function at each point in the parameter space. They operate based on natural principles of evolution such as mutation, recombination, and selection. These operations are adapted so that the algorithm automatically develops and attempts to optimize a model landscape relating the cost function to its parameters. Compared with gradient-based techniques, their convergence rate is usually much lower, thus requiring large numbers of iterations that could be unrealistic for some problems of engineering interest. On the other hand, they are highly parallel algorithms that efficiently exploit today's powerful parallel computer architectures, and they are more likely than gradient-based algorithms to identify a global optimum. This latter aspect makes them attractive in many engineering applications where the fitness landscape cannot be assumed unimodal.

289 Rechenberg, I., "Evolutionsstrategie: Optimierung technischer Systeme nach Prinzipien der biologischen Evolution", Fromann-Holzboog, Stuttgart, 1973.

10.6.3 Accomplishments
Data methods are certainly not new in the fluids community. Computational fluid dynamics has capitalized on Machine Learning efforts with dimensionality-reduction techniques such as proper orthogonal decomposition or dynamic mode decomposition, which compute interpretable low-rank modes and subspaces that characterize spatial-temporal flow data290. Proper Orthogonal Decomposition (POD) and Dynamic Mode Decomposition (DMD) are based on the singular value decomposition, which is ubiquitous in the dimensionality reduction of physical systems. When coupled with Galerkin projection, POD reduction forms the mathematical basis of reduced-order modelling, which provides an enabling strategy for computing high-dimensional discretizations of complex flows291. The success of dimensionality reduction in fluids is enabled by

• significant performance gains in computational speed and memory,
• generation of physically interpretable spatial and/or spatial-temporal modes that dominate the physics.

Thus computations are enabled and critical physical intuition gained. Such success is tempered by two well-known failings of POD/DMD based reductions:

• their inability to capture transient, intermittent and/or multi-scale phenomena without significant tuning,
• their inability to capture invariances due to translation, rotation and/or scaling.

290 Holmes, P., Lumley, J. & Berkooz, G., "Turbulence, Coherent Structures, Dynamical Systems and Symmetry", Cambridge University Press, 1998.
291 Benner, P., Gugercin, S. & Willcox, K., "A survey of projection-based model reduction methods for parametric dynamical systems", SIAM Rev. 57, 483-531, 2015.


ANNs are almost diametrically opposed in their pros and cons. Specifically, ANNs are well suited for extracting multi-scale features, as the ANN decomposition shares many similarities with wavelet decompositions, which are the computational workhorse of multi-resolution analysis. Moreover, translations, rotations and other invariances are known to be easily handled in the ANN architecture. These performance gains are tempered by the tremendous computational cost of building an ANN from a large training set and the inability of ANNs to produce easily interpretable physical modes and/or features.
10.6.4 Field Inversion and Machine Learning in Support of a Data-Driven Environment
A machine learning technique such as an Artificial Neural Network (ANN) can be applied through field inversion in a data-driven context. The calibration cases (offline data) consist of a few configurations with data (DNS or experimental), such as the one shown in Figure 10.7. The prediction cases (machine learning with no data) have a similar configuration but with different (1) twist, (2) sweep angles, and (3) airfoil shape292. The challenge in predictive modeling, however, is to extract an optimal model form that is sufficiently accurate. Constructing such a model and demonstrating its predictive capabilities for a class of problems is the objective.

Figure 10.7 Calibration Cases for Offline Data

10.6.4.1 Artificial Neural Networks (ANNs)

The goal is a functional relationship b(η), where η = [η1, η2, ..., ηM]^T are input features derived from mean-field variables that will be available during the predictive solution process. The functional relationship must be developed by considering the output of a number of inverse problems representative of the modeling deficiencies relevant to the predictive problem. Further, as explained below, elements of the feature vector η are chosen to be locally non-dimensional quantities. The standard NN algorithm operates by constructing linear combinations of inputs and transforming them through nonlinear activation functions293. The process is repeated once for each hidden layer (marked blue in Figure 10.8) in the network, until the output layer is reached. Figure 10.8 presents a sample network diagram for a feed-forward NN with three inputs, two hidden layers, and one output. For this sample network, the values of the hidden nodes z_{1,1} through z_{1,H1} would be constructed as

 3  z1,i = a1   w1i, jηi   i =1 

Eq. 10.2

where a^1 and w^1_{i,j} are the activation function and weights associated with the first hidden layer, respectively. Similarly, the second layer of hidden nodes is constructed as

292 Heng Xiao, "Physics-Informed Machine Learning for Predictive Turbulence Modeling: Status, Perspectives, and Case Studies", Machine Learning Technologies and Their Applications to Scientific and Engineering Domains Workshop, August 17, 2016.
293 Anand Pratap Singh, Shivaji Medida, Karthik Duraisamy, "Machine Learning-augmented Predictive Modeling of Turbulent Separated Flows over Airfoils", Nov 2016.


z_{2,j} = a^2 ( Σ_{i=1}^{H1} w^2_{i,j} z_{1,i} )
Eq. 10.3

Finally, the output is

y ≈ f(η) = a^3 ( Σ_{i=1}^{H2} w^3_{i,j} z_{2,i} )
Eq. 10.4
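As an illustration of Eqs. 10.2 to 10.4, the sketch below, a hypothetical example with randomly initialized weights rather than a trained model, evaluates the forward pass of a feed-forward network with three inputs, two sigmoid hidden layers, and a single output. The layer sizes and feature values are made up.

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

rng = np.random.default_rng(42)
H1, H2 = 5, 4                       # sizes of the two hidden layers

# Randomly initialized weight matrices (a trained network would supply these)
W1 = rng.normal(size=(H1, 3))       # first hidden layer, 3 inputs  (Eq. 10.2)
W2 = rng.normal(size=(H2, H1))      # second hidden layer           (Eq. 10.3)
W3 = rng.normal(size=(1, H2))       # output layer                  (Eq. 10.4)

def forward(eta):
    """Forward pass for a single feature vector eta = [eta1, eta2, eta3]."""
    z1 = sigmoid(W1 @ eta)          # Eq. 10.2
    z2 = sigmoid(W2 @ z1)           # Eq. 10.3
    return (W3 @ z2)[0]             # Eq. 10.4 (linear output chosen here)

eta = np.array([0.3, -1.2, 0.7])    # hypothetical non-dimensional features
print("network output y =", forward(eta))
```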

Given training data, error back-propagation algorithms294 are used to find the weights w^n_{i,j}.

Figure 10.8 Network Diagram for a Feed-Forward NN with Three Inputs and One Output

Once the weights are found, computing the output depends only on the number of hidden nodes, and not on the volume of the training data. Hyper-parameters of the NN method include the number of hidden layers, the number of nodes in each hidden layer, and the forms of the activation functions. Typically, 3 layers and about 100 nodes were employed with a sigmoid activation function.
10.6.5 The POD as Linear Artificial Neural Network (LANN)
A model reduction can be accomplished by projecting the model equations, i.e. the Navier-Stokes equations, onto a properly selected lower-dimensional phase subspace. A reasonable selection criterion for the basis of this subspace is the maximization of the energy content of the projection295. This can be done by applying the Karhunen-Loeve decomposition to a data set that is representative of the dynamics of the system that we wish to approximate. This operation is called Proper Orthogonal Decomposition (POD)296. The linear POD is an approximation of the flow vector v by a finite expansion of orthonormal functions φ_n such that:

294 Zhang, Z. J. and Duraisamy, K., "Machine Learning Methods for Data-Driven Turbulence Modeling", 22nd AIAA Computational Fluid Dynamics Conference, AIAA Aviation (AIAA 2015-2460), Dallas, TX, Jun 2015.
295 Müller, S., Milano, M. and Koumoutsakos, P., "Application of machine learning algorithms to flow modeling and optimization", Center for Turbulence Research Annual Research Briefs, 1999.
296 Berkooz, G., Holmes, P. & Lumley, J. L., "The proper orthogonal decomposition in the analysis of turbulent flows", Ann. Rev. Fluid Mech. 25, 539-575, 1993.


v = V + Σ_n a_n(t) φ_n(x)
Eq. 10.5

where V is the time-averaged flow and φ_n is the set of the first n eigenvectors of the covariance matrix C = E[(v_i − V)(v_j − V)]; when this representation for v is substituted in the Navier-Stokes equations, the original PDE model is transformed into an ODE model composed of n equations. The POD can be expressed as a multi-layer feed-forward neural network. Such a network is defined by the number of layers, the specification of the output function for the neurons in each layer, and the weight matrices for each layer. [Baldi and Hornik]297 have shown that training a linear neural network structure to perform an identity mapping on a set of vectors is equivalent to obtaining the POD of this set of vectors. A neural network performing the linear POD can be specified as a 2-layer linear network:

x = W1 v ,   v̂ = W2 x
Eq. 10.6

where v̂ is the reconstructed field, v is the original flow field, having N components, x is the reduced-order representation of the field, having n components, and W1 and W2 are the network weight matrices, of sizes n x N and N x n respectively. Non-linearity can be introduced by a simple extension of this basic network:

x = W2 tanh(W1 v) ,   v̂ = W4 tanh(W3 x)
Eq. 10.7

This corresponds to a neural network model with 4 layers: the first one, with an m x N weight matrix W1, nonlinear; the second one, with an n x m weight matrix W2, linear; the third one, also nonlinear, with an m x n weight matrix W3, and the last one, linear with an N x m weight matrix W4. However, the resulting system of ODEs is more involved as compared to the one resulting from the application of the linear POD.
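The equivalence between the linear two-layer network of Eq. 10.6 and the POD can be made tangible with snapshot data. The sketch below is a hypothetical example: it builds a synthetic snapshot matrix, extracts the leading POD modes with an SVD, and reconstructs a field using W1 equal to the transpose of the mode matrix and W2 equal to the mode matrix itself, which is exactly the form x = W1 v, v̂ = W2 x. The signal and mode count are invented for illustration.

```python
import numpy as np

# Hypothetical snapshot matrix: N spatial points, M snapshots in time
N, M, n_modes = 200, 50, 7
grid = np.linspace(0.0, 2.0 * np.pi, N)
snapshots = np.array([np.sin(grid - 0.1 * t) + 0.3 * np.sin(3 * grid + 0.05 * t)
                      for t in range(M)]).T           # shape (N, M)

V_mean = snapshots.mean(axis=1, keepdims=True)        # time-averaged flow V
fluctuations = snapshots - V_mean

# POD modes = left singular vectors of the fluctuation matrix
Phi, singular_values, _ = np.linalg.svd(fluctuations, full_matrices=False)
Phi_n = Phi[:, :n_modes]                              # first n orthonormal modes

# Linear two-layer "network" of Eq. 10.6: encoder W1 = Phi_n.T, decoder W2 = Phi_n
v = fluctuations[:, 0]                                # one snapshot
x = Phi_n.T @ v                                       # reduced representation (n components)
v_hat = Phi_n @ x                                     # reconstructed field (N components)

relative_error = np.linalg.norm(v - v_hat) / np.linalg.norm(v)
print(f"reconstruction error with {n_modes} modes: {relative_error:.2e}")
```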

297 Baldi, P. & Hornik, K., "Neural networks and principal component analysis: Learning from examples without local minima", Neural Networks, 2, 53-58, 1989.


10.6.5.1 POD and Nonlinear ANN
A simple comparison of POD and nonlinear ANN is provided by the reconstruction of the velocity field in the stochastically forced Burgers equation, a classical 1D model for turbulent flow [Chambers]298. The linear POD was used to obtain a set of 256 linear eigenfunctions using 10000 snapshots extracted from a simulation. Using the first 7 eigenfunctions it is possible to reconstruct the original flow field, keeping 90 percent of the energy. A nonlinear neural network was trained on the same data set to perform the identity mapping: this network is composed of 256 inputs and 4 layers having respectively 64 nonlinear neurons, 7 linear neurons, 64 nonlinear neurons, and 256 linear neurons. For validation purposes, a data set of 1000 snapshots, not used in the training phase, was used. In Figure 10.9 it is possible to appreciate the reconstruction performance of both approaches; the nonlinear ANN clearly outperforms the linear POD.

Figure 10.9 Comparison of Linear POD (top) and Nonlinear ANN (bottom) Reconstructions of a Velocity Field in the Burgers Equation

10.6.6 Overview of ANNs in Turbulence Applications
Turbulent flows generally exhibit multi-scale (spatial and temporal) physics that are high dimensional, with rotational and translational intermittent structures also present. Such data provide an opportunity for ANNs to make an impact in the modelling and analysis of turbulent flow fields. [Ling, Kurzawski & Templeton]299 have proposed using ANNs for Reynolds-averaged Navier-Stokes (RANS) models, which are widely used because of their computational tractability in modelling the rich set of dynamics induced by turbulent flows. In this highlighted body of work, the specific aim is to use ANNs to build an improved representation of the Reynolds stress anisotropy tensor from high-fidelity simulation data. Remarkably, despite the widespread success of ANNs at providing high-quality predictions in complex problems, there have been only limited attempts to apply deep learning techniques to turbulence.

298 Chambers, D. H., Adrian, R. J., Moin, P. & Stewart, S., "Karhunen-Loeve expansion of Burgers model of turbulence", Phys. Fluids, 31, 2573-2582, 1998.
299 Ling, J., Kurzawski, A. & Templeton, J., "Reynolds averaged turbulence modelling using deep neural networks with embedded invariance", J. Fluid Mech. 807, 155-166, 2016.


Thus far, these attempts have been limited to a couple of hidden layers. Figure 10.10 shows the skin friction coefficient for the Onera M6 wing being matched to within 2%300.

Figure 10.10 Skin Friction Coefficient for the Onera M6 Wing Matched to Within 2% (panels: True and Machine Learning)

10.6.7 The Future of ANNs for Fluids Modelling
ANNs will almost certainly have a transformative impact on modelling high-dimensional complex systems such as turbulent flows. The successes with many complex data sets will compel researchers to utilize this rapidly emerging data analysis tool for improving predictive capabilities. ANNs represent a paradigm shift for the community. Whereas many innovations have often been inspired by expert-in-the-loop intuition and physically interpretable models, ANNs have challenged these traditional notions by building prediction engines that simply outperform competing methods without providing clear evidence of why they are doing so. To some extent, the application of ANNs to turbulent flows will bring awareness to the fluids community of the two cultures of statistics and data science. These two outlooks are centered around the concepts of machine learning and statistical learning. The former focuses on prediction (ANNs) while the latter is concerned with inference of interpretable models from data (POD/DMD reductions). Although both methodologies have achieved significant success across many areas of big data analytics, the physical and engineering sciences have primarily focused on interpretable methods. Despite its successes, significant challenges remain for ANNs. Simple questions remain:

1. How many layers are necessary for a given data set?
2. How many nodes at each layer are needed?
3. How big must my data set be to properly train the network?
4. What guarantees exist that the mathematical architecture can produce a good predictor of the data?
5. What is my uncertainty and/or statistical confidence in the ANN output?
6. Can I actually predict data well outside of my training data?
7. Can I guarantee that I am not overfitting my data with such a large network?

300 Karthik Duraisamy, "A Framework for Turbulence Modeling using Big Data", NASA Aeronautics Research Mission Directorate (ARMD) LEARN/Seedling Technical Seminar, January 13-15, 2015.


And the list goes on. These questions remain central to addressing the long-term viability of ANNs. The good news is that such topics are currently being intensely investigated by academic researchers and industry (Google, Facebook, etc.) alike. Undoubtedly, the next decade will witness significant progress in addressing these issues. From a practical standpoint, one should determine the number of layers and nodes based upon prediction success, i.e., stop adding layers and nodes when they no longer improve performance. Additionally, cross-validation is imperative to suppress overfitting. As a general rule, one should never trust the results of an ANN unless rigorous cross-validation has been performed. Cross-validation plays the same critical role as a convergence study of a numerical scheme. Given the computational maturity of ANNs and how readily available they are (see Google's open-source software called TensorFlow), it is perhaps time for part of the turbulence modelling community to adopt what has become an important and highly successful part of the machine learning culture: challenge data sets.
10.6.8 Classification of Machine Learning (ML) Frameworks for Data-Driven Thermal Fluid Models as Envisioned by [Chang & Dinh]301
We focus on data-driven Thermal Fluid Simulation (TFS), specifically on its development using Machine Learning (ML). Five ML frameworks are introduced by [Chang and Dinh]302, including:

1. Physics-Separated ML (PSML or Type-I)
2. Physics-Evaluated ML (PEML or Type-II)
3. Physics-Integrated ML (PIML or Type-III)
4. Physics-Recovered ML (PRML or Type-IV)
5. Physics-Discovered ML (PDML or Type-V)

The frameworks vary in their performance for different applications depending on the level of knowledge of the governing physics and on the source, type, amount and quality of available data for training. Notably, outlined for the first time in this investigation, Type-III models present stringent requirements on modeling and substantial computing resources for training, and have high potential in extracting value from "big data" in thermal fluid research. First, the current investigation demonstrates and explores the ML frameworks on an example, the heat diffusion equation with a nonlinear conductivity model, formulated by convolutional neural networks (CNNs) and feedforward neural networks (FNNs), to illustrate the applications of Type-I, Type-II, Type-III, and Type-V ML. The results indicate a preference for Type-II ML under deficient data support. Type-III ML can effectively utilize field data, potentially generating more robust predictions than Type-I and Type-II ML. CNN-based closures exhibit more predictability than FNN-based closures, but CNN-based closures require more training data to obtain accurate predictions. Second, we illustrate how to employ the Type-I ML and Type-II ML frameworks for data-driven turbulence modeling using reference works. Third, we demonstrate Type-I ML by building a deep FNN-based slip closure for two-phase flow modeling. The results show that deep FNN-based closures exhibit a bounded error in the prediction domain.
10.6.8.1 Machine Learning (ML) for Thermal Fluid Simulation
Machine learning (ML) can be used to develop closure models by learning from the available, relevant, and adequately evaluated data303 with nonparametric models.

301 Chih-Wei Chang and Nam T. Dinh, "Classification of Machine Learning Frameworks for Data-Driven Thermal Fluid Models", North Carolina State University, Raleigh, NC 27695-7909.
302 Chih-Wei Chang and Nam T. Dinh, "Classification of Machine Learning Frameworks for Data-Driven Thermal Fluid Models", North Carolina State University, Raleigh, NC 27695-7909.
303 Thermal fluid simulations involve conservation equations with various degrees of averaging from the first principle based on distinct hypotheses. The underlying physics of the conservation equations should be consistent with the experiment or simulation where the Available, Relevant, and Adequately Evaluated Data (ARAED) are obtained.


While the concept of ML is not new, the past decade has witnessed a significant growth of capability and interest in machine learning thanks to advances in algorithms, computing power, affordable memory, and the abundance of data. There is a wide range of applications of machine learning in different areas of engineering practice. In the narrow context of the present study, machine learning is defined as the capability to create effective surrogates for a massive amount of data from measurements and simulations. Figure 10.11 depicts a workflow of employing ML for developing thermal fluid closures. The objective is to construct a function to represent the unknown model that correlates inputs and targets. Since supervised learning is of interest, inputs and targets are essential, and they can be obtained from ARAED. X denotes the flow feature space as inputs, Y denotes the response space as targets that are associated with the flow features, and the subscript k denotes the kth measurement at a certain location. After collecting all relevant datasets, ML models are generalized by a set of nonlinear functions with hyperparameters to represent a thermal fluid closure. Based on different ML methods, various algorithms are employed to seek an optimal solution that allows an ML-based model to fit the observed data. Based on distinct learning purposes, [Domingos]304 classified ML methods into five tribes including symbolists, evolutionaries, analogizers, connectionists, and Bayesians. [Ling & Templeton]305 evaluated the predictability of various ML algorithms for predicting regions of high Reynolds-averaged Navier-Stokes (RANS) uncertainty.
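The workflow of Figure 10.11, building a nonparametric closure ML(X) ≈ Y from paired flow features and responses, can be sketched in a few lines. The example below is hypothetical and assumes the scikit-learn library; X and Y are synthetic stand-ins for features and closure targets extracted from ARAED, not actual thermal fluid data.

```python
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(7)

# Stand-in data: flow features X (e.g., local gradients, non-dimensional groups)
# and closure targets Y; here Y is a made-up nonlinear function of X plus noise.
X = rng.uniform(-1.0, 1.0, size=(2000, 3))
Y = np.tanh(2.0 * X[:, 0]) + 0.5 * X[:, 1] * X[:, 2] + 0.02 * rng.normal(size=2000)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

# Nonparametric closure ML(X) ~ Y: a small feed-forward network
closure = MLPRegressor(hidden_layer_sizes=(32, 32), activation="tanh",
                       max_iter=2000, random_state=0)
closure.fit(X_train, Y_train)

print("R^2 on held-out data:", closure.score(X_test, Y_test))
```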

Figure 10.11 Workflow of Employing ML Methods for Developing Thermal Fluid Closures: inputs X = {x_k}, targets Y = {y_k}, k = 1, 2, ..., n, and the ML-based closure ML(X) ≈ Y - (Courtesy of Chang & Dinh)

304 Domingos, P., The Master Algorithm, Basic Books, 2015.
305 Ling, J., Templeton, J., "Evaluation of machine learning algorithms for prediction of regions of high Reynolds averaged Navier Stokes uncertainty", Physics of Fluids, 2015.


10.6.8.2 Thermal Fluid Data
Figure 10.12 provides an overall characterization of thermal fluid data by data type, data source, and data quality. The global data are system conditions and integrated variables such as system pressure, mass flow rate, pressure drop, and total heat input. The local data are time series data at specific locations. The field data are measurements of field variables resolved in space and in time. Traditionally, experiments are a primary source of data, including so-called integral effect tests (IETs) and separate effect tests (SETs). As the name suggests, SETs and IETs are designed to investigate isolated phenomena and complex (tightly coupled) phenomena, respectively. Increasingly, appropriately validated numerical simulations become a credible source of data. This includes high-fidelity numerical simulations (e.g., DNS, and other CFD methods), as well as system-level simulation using computer models in parameter domains that are extensively calibrated and validated. It is noted that datasets vary by their quality regarding the quantity and uncertainty. The amount of data affects the performance of inverse modeling since sufficient data can reduce the model parameter uncertainty in the domain of interest. Within a narrow context of ML for thermal fluid simulation, the data quality can be characterized by the amount of relevant and adequately evaluated data (i.e., data quantity) and associated uncertainty (including measurement uncertainty and other biases, e.g., scaling, processing).

Figure 10.12 Hierarchy of Thermal Fluid Data: Type (Global, Local, Field), Source (Experiment: IET, SET; Simulation), and Quality (Quantity, Uncertainty) - (Courtesy of Chang & Dinh)

10.6.8.3 Machine Learning Frameworks for Data-Driven Thermal Fluid Models
As evident from the preceding discussion, the time is ripe for applying ML methods to assist developments of data-driven thermal fluid models. There is a large variety of ways to embed ML in the models. The three major factors are knowledge of physics (overall model form), available data, and machine learning techniques. It is necessary to incorporate knowledge of the underlying physics into ML-based models whenever such knowledge is available and trustworthy.


The benefits of so doing are wide-ranging, from preventing models from generating unphysical results to narrowing the search space by reducing the dimension of problems. Here, the focus is placed on the use of neural networks, specifically deep learning (or multilayer neural networks), which has recently emerged as a capable and universal approximator. Notably, their hierarchical structures are deemed appropriate for describing complex models that involve multiple scales. The objective is to develop a system to characterize the different approaches to using ML to aid developments of data-driven models in thermal-fluid simulation, and to help navigate an inter-disciplinary domain (of thermal-fluid simulation, data-driven modeling, and deep learning). The technical approach stems from literature analysis, implementation, and investigation of the major types of ML frameworks on synthetic examples.
10.6.8.4 Criteria for Classifying ML Frameworks for Thermal Fluid Simulation
Each framework has its distinct goal and approach to leverage data. Since we classify five frameworks, we build the classification system based on four conditions. First, we examine whether solutions are converged, meaning that solutions conserve the mass-momentum-energy balance in a control volume. Second, we check if the framework focuses on developing fluid closures. Third, we distinguish Type-III ML from other frameworks because it inherently ensures data-model consistency. Finally, the last condition is about the separation of scales. Accounting for all four conditions, we categorize five distinct types of ML frameworks for thermal fluid simulation based on the following four criteria:
10.6.8.4.1 Criterion 1 - Is PDE Involved in Thermal Fluid Simulation?
The first criterion examines whether conservation equations are involved in thermal fluid simulation. Type-V ML relies on ML to discover the underlying physics directly from data and to deliver equivalent surrogates of governing equations. Type-V ML is an extreme case when there is no prior knowledge, and we must purely depend on the observed data. By this criterion, we can distinguish Type-V ML from the other four ML frameworks.
10.6.8.4.2 Criterion 2 - Is the Form of PDEs Given?
The second criterion inspects if the form of conservation models is known. Type-IV ML does not make biases on selecting physics models; instead, it recovers the exact form of conservation models based on data. Therefore, we can distinguish Type-IV ML from Type-I, Type-II, and Type-III ML.
10.6.8.4.3 Criterion 3 - Is the PDE Involved in the Training of Closure Relations?
PDEs are involved in Type-I, Type-II, and Type-III ML. Therefore, the goal is to develop closure models in nonparametric forms to close conservation equations. Criterion 3 checks whether conservation equations are involved in the training of ML-based closures. Traditionally, the assumptions of scale separation and physics decomposition are essential to develop closure models. The former allows us to set up SETs for various scales while the latter decomposes closure relations into different physics within the same scale. However, in many thermal fluid processes, the physics (physical mechanisms) is tightly coupled. Type-III ML avoids these two assumptions by training closure models that are embedded in PDEs. By this criterion, we can distinguish Type-III ML from Type-I and Type-II ML.

Table 10.2 Criteria for the ML Framework Classification - (Courtesy of Chang & Dinh)

Classification                                                          Type I   Type II   Type III   Type IV   Type V
Is PDE involved in thermal fluid simulation?                             Yes      Yes       Yes        Yes       No
Is the form of PDEs given?                                               Yes      Yes       Yes        No        No
Is the PDE involved in the training of closure relations?                No       No        Yes        No        No
Is a scale separation assumption required for the model development?    Yes      No        No         No        No
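The decision logic of Table 10.2 can be written down directly. The helper below is a hypothetical illustration that maps answers to the four criteria onto the five framework types; the function name and signature are made up for this sketch.

```python
def classify_ml_framework(pde_involved: bool,
                          pde_form_given: bool,
                          pde_in_closure_training: bool,
                          scale_separation_required: bool) -> str:
    """Return the ML framework type implied by the four criteria of Table 10.2."""
    if not pde_involved:
        return "Type-V (Physics-Discovered ML)"
    if not pde_form_given:
        return "Type-IV (Physics-Recovered ML)"
    if pde_in_closure_training:
        return "Type-III (Physics-Integrated ML)"
    if scale_separation_required:
        return "Type-I (Physics-Separated ML)"
    return "Type-II (Physics-Evaluated ML)"

# Example: closures trained separately from the PDEs, using SET data
print(classify_ml_framework(True, True, False, True))   # -> Type-I
```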


10.6.8.4.4 Criterion 4 - Is a Scale Separation Assumption Required for the Model Development?
This criterion tests whether the model development requires the separation of scales. This hypothesis isolates closure relations from conservation equations so that the models can be separately built and calibrated by SETs. The scale separation is essential for Type-I ML because it relies only on data to construct closure models. However, the data from SETs may have been distorted, while IETs are designed to capture (a selected set of) multi-physics phenomena. Table 10.2 summarizes the criteria to classify the five distinct types of ML frameworks for thermal fluid simulation.
10.6.8.5 Type-I: Physics-Separated Machine Learning (PSML)
Type-I ML or so-called physics-separated ML (PSML) aims at developing closure models by using SET data. Type-I ML assumes that conservation equations and closure relations are scale separable, for which the models are local. Type-I ML requires a thorough understanding of the system so that SETs can be designed to support model developments. We can apply ML-based closures to assimilate data to achieve data-driven thermal fluid simulation. Figure 10.13 depicts the architecture of the Type-I ML framework, which is forward data-driven modeling. The procedure includes the following elements:
10.6.8.5.1 Element 1
Assume a scale separation is achievable such that closure models can be built from SETs. From either high-fidelity simulations or experiments, collect training data, (xk, yk).
10.6.8.5.2 Element 2
Preprocess data from Element 1 to ensure that data from multiple sources have the same dimension and manipulation, such as the selection of averaging methods. Additionally, consider normalizing data so that we can approximately equalize the importance of each data source. For large datasets, employing principal component analysis can be helpful to reduce the dimension of the data.
10.6.8.5.3 Element 3
Compute flow features or system characteristics, X, as training inputs for Element 5.
10.6.8.5.4 Element 4
Calculate the corresponding outputs (Y) of the desired closures from data as training targets that can supervise ML algorithms to learn from data.
10.6.8.5.5 Element 5
Utilize ML algorithms to build a correlation between inputs and targets. After the training, output the ML-based closure model, ML(X), to Element 6.
10.6.8.5.6 Element 6
Constrain the ML-based closure, g(ML(X)), to satisfy model assumptions and to ensure the smoothness of model outputs, since it needs to be solved with PDEs. It is noted that this element is not essential if such assumptions are not applicable.
10.6.8.5.7 Element 7
Implement the ML-based closure into the conservation equations, and solve the PDEs for predictions with the embedded ML-based closure, which is iteratively queried.
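A minimal end-to-end sketch of the Type-I (PSML) procedure is given below. It is a hypothetical toy problem in the spirit of the heat-diffusion example mentioned earlier, not the authors' implementation: synthetic "SET" data for a nonlinear conductivity stand in for Elements 1 to 4, a small feed-forward network is the trained closure of Element 5, a positivity constraint plays the role of Element 6, and the closure is then queried iteratively inside a 1D steady heat-conduction solve as in Element 7. The scikit-learn library is assumed.

```python
import numpy as np
from sklearn.neural_network import MLPRegressor

# ---- Elements 1-4: synthetic "SET" data for the closure (hypothetical) --------
# True (unknown to the solver) nonlinear conductivity model k(T) = 1 + 0.5*T^2
rng = np.random.default_rng(0)
T_samples = rng.uniform(0.0, 1.0, size=500)
k_samples = 1.0 + 0.5 * T_samples**2 + 0.01 * rng.normal(size=T_samples.size)

# ---- Element 5: train the ML-based closure ML(X) -------------------------------
closure = MLPRegressor(hidden_layer_sizes=(20, 20), activation="tanh",
                       max_iter=5000, random_state=0)
closure.fit(T_samples.reshape(-1, 1), k_samples)

# ---- Element 6: constrain the closure output (keep conductivity positive) ------
def k_ml(T):
    k = closure.predict(np.asarray(T).reshape(-1, 1))
    return np.clip(k, 1e-6, None)

# ---- Element 7: embed the closure in the conservation equation -----------------
# Steady 1D heat conduction d/dx( k(T) dT/dx ) = 0 on [0,1], T(0)=0, T(1)=1.
# Picard iteration: freeze k(T), solve the linear problem, update, repeat.
n = 51
T = np.linspace(0.0, 1.0, n)                        # initial guess
for _ in range(50):
    k_face = 0.5 * (k_ml(T[:-1]) + k_ml(T[1:]))     # conductivity at cell faces
    A = np.zeros((n, n))
    b = np.zeros(n)
    A[0, 0], A[-1, -1] = 1.0, 1.0                   # Dirichlet boundary conditions
    b[0], b[-1] = 0.0, 1.0
    for i in range(1, n - 1):
        A[i, i - 1] = k_face[i - 1]
        A[i, i + 1] = k_face[i]
        A[i, i] = -(k_face[i - 1] + k_face[i])
    T_new = np.linalg.solve(A, b)
    if np.max(np.abs(T_new - T)) < 1e-10:
        T = T_new
        break
    T = T_new

print("temperature at the mid-plane:", T[n // 2])
```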


Type-I ML satisfies the criteria from Table 10.2 except the third criterion. The quality of SET data largely controls the performance of closure models obtained by Type-I ML. While the experimental uncertainty in each SET may be controlled and reduced, the process uncertainty (dominated by design assumptions) is irreducible. We say that PDEs and closure relations are decoupled in Type-I ML, which can cause model biases between conservation equations and closure relations. It is noted that inferring model parameters from data belongs to the class of inverse problems, which are ill-posed [34]. For ML models, a small change in inputs can result in large uncertainty in outputs. When implementing ML-based closures in PDEs, this uncertainty can lead to a discontinuity that causes the numerical simulation to fail. For more practical examples related to Type-I ML, readers are referred to [Ma et al.]306-307, [Parish & Duraisamy]308, [Zhang & Duraisamy]309, among numerous others.

Figure 10.13 Overview of the Type-I ML Framework with a Scale Separation Assumption - (Courtesy of Chang & Dinh)

306 Ma M., Lu J., Tryggvason G., Using statistical learning to close two-fluid multiphase flow equations for a simple bubbly system, Physics of Fluids, 27 (2015).
307 Tryggvason G., Ma M., Lu J., DNS-Assisted Modeling of Bubbly Flows in Vertical Channels, Nuclear Science and Engineering, 184 (2016) 312-320.
308 Parish E.J., Duraisamy K., A paradigm for data-driven predictive modeling using field inversion and machine learning, Journal of Computational Physics, 305 (2016) 758-774.
309 Zhang Z.J., Duraisamy K., Machine Learning Methods for Data-Driven Turbulence Modeling, American Institute of Aeronautics and Astronautics, 2015.


10.6.8.6 Type-II: Physics-Evaluated Machine Learning (PEML)
Type-II ML, or so-called physics-evaluated ML (PEML), focuses on reducing the uncertainty of the conservation equations. It requires prior knowledge for selecting the closure models used to predict thermal fluid behaviors. Type-II ML utilizes high-fidelity data to inform low-fidelity simulation. Compared to high-fidelity models, ROMs can solve engineering design problems within an affordable time frame, but they may produce significant uncertainty in their predictions. Type-II ML can reduce the uncertainty of the low-fidelity simulation using reference data. Since the physics of thermal fluids is nonlinear, ML algorithms are employed to capture the underlying correlations behind high-dimensional data. The framework requires training inputs, such as flow features that represent the mean flow properties, and training targets, which are the responses corresponding to those input flow features. Type-II ML satisfies the first two criteria in Figure 10.12. We say that PDEs and closure relations are loosely coupled in Type-II ML because the PDEs are only used for calculating the input flow features. The framework provides a one-step solution to improve the low-fidelity simulation, and model uncertainty is not accumulated because the numerical solvers do not interact with the ML models. However, an open question for Type-II ML is how large the initial errors can be before a prior solution can no longer be brought to the reference solution. For more detailed examples of Type-II ML, readers are referred to [Ling & Templeton]310, [Ling et al.]311, [Wu et al.]312, and [Zhu & Dinh]313.

310 Ling J., Templeton J., Evaluation of machine learning algorithms for prediction of regions of high Reynolds averaged Navier Stokes uncertainty, Physics of Fluids, 27 (2015) 085103.
311 Ling J., Jones R., Templeton J., Machine learning strategies for systems with invariance properties, Journal of Computational Physics, 318 (2016) 22-35.
312 Wu J.-L., Wang J.-X., Xiao H., Ling J., Physics-informed machine learning for predictive turbulence modeling: A priori assessment of prediction confidence, (2016).
313 Zhu Y., Dinh N.T., A Data-Driven Approach for Turbulence Modeling, in: NURETH-17, American Nuclear Society, Xi'an, China, 2017.
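The sketch below illustrates the Type-II idea on synthetic data: a regressor is trained on local flow features to predict the discrepancy between a low-fidelity closure and high-fidelity reference values, and the prediction is then applied as an additive correction. The feature definitions, the synthetic closure values, and the choice of a random-forest regressor are illustrative assumptions; they are not the specific models or datasets used in the works cited above.

# Type-II (physics-evaluated) sketch: learn the discrepancy (high - low fidelity)
# from local flow features, then correct the low-fidelity prediction.
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(1)

features = rng.uniform(size=(2000, 2))                  # e.g. strain rate, wall distance
low_fi   = 0.5 * features[:, 0]                         # crude baseline closure
high_fi  = 0.5 * features[:, 0] + 0.2 * np.sin(3.0 * features[:, 1])

model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(features, high_fi - low_fi)                   # train on the discrepancy only

query = rng.uniform(size=(5, 2))                        # new flow-feature samples
corrected = 0.5 * query[:, 0] + model.predict(query)    # low fidelity + ML correction
print(corrected)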
10.6.8.7 Type-III: Physics-Integrated Machine Learning (PIML)
To the best of the authors' knowledge, Type-III ML, or so-called physics-integrated ML (PIML), is introduced and developed for the first time in this work. Type-III ML aims at developing closure relations that close thermal fluid models without a scale separation assumption: the closure models are embedded and trained within the system dynamics. Training data can be obtained from both SETs and IETs. Notably, Type-III ML can lead to a paradigm shift in the use of ML for thermal fluid simulation because it allows the direct use of field data from IETs. Inputs for Type-III ML do not come directly from observations; instead, they are solutions of the PDEs. Type-III ML satisfies most of the criteria in Table 10.2 except the fourth. We say that PDEs and closure relations are tightly coupled in Type-III ML, which makes it a challenging problem: such tightly coupled multiscale problems require that numerical solutions of the governing PDE system be realized (hence evolving the training datasets) whenever the ML algorithms tune the model parameters. Type-III ML is therefore computationally expensive. Research on the Type-III ML methodology promises a high-potential impact in complex thermal fluid problems where the separation of scales or the physics decomposition may introduce significant errors.
10.6.8.8 Type-IV: Physics-Recovered Machine Learning (PRML)
Type-IV ML, or so-called physics-recovered ML (PRML), aims at recovering the exact form of the PDEs. It requires no assumption about the form of the governing equations; instead, the framework requires the construction of a candidate library that includes components of the governing equations such as time derivatives, advection, diffusion, and higher-order terms. Type-IV ML satisfies only the first criterion in Table 10.2. The challenge for Type-IV ML is the recovery of the closure relations in thermal fluid models: closure models are usually complex and are hard to represent by individual derivative terms. Therefore, how to apply Type-IV ML to complex flow systems, such as turbulence modeling, remains an open question. For more practices related to Type-IV ML, readers are referred to [Brunton et al.]314.
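In the spirit of the sparse-identification approach of [Brunton et al.]314, the sketch below builds a small candidate library of derivative terms from manufactured advection-diffusion data and applies sequentially thresholded least squares to recover the active terms. The manufactured solution, the library contents, and the threshold value are illustrative assumptions made for this example.

# Type-IV sketch: recover the terms of u_t = -c u_x + nu u_xx from data by
# sparse regression over a library of candidate terms.
import numpy as np

c, nu = 1.0, 0.1
x = np.linspace(0.0, 2.0 * np.pi, 256)
t = np.linspace(0.0, 1.0, 101)
X, T = np.meshgrid(x, t, indexing="ij")
# superpose two modes so the library columns are not collinear
u = sum(np.exp(-nu * k**2 * T) * np.sin(k * (X - c * T)) for k in (1.0, 3.0))

dx, dt = x[1] - x[0], t[1] - t[0]
u_t  = np.gradient(u, dt, axis=1)
u_x  = np.gradient(u, dx, axis=0)
u_xx = np.gradient(u_x, dx, axis=0)

# candidate library: u, u_x, u_xx, u*u_x
Theta  = np.column_stack([u.ravel(), u_x.ravel(), u_xx.ravel(), (u * u_x).ravel()])
target = u_t.ravel()

# sequentially thresholded least squares
xi = np.linalg.lstsq(Theta, target, rcond=None)[0]
for _ in range(10):
    small = np.abs(xi) < 0.05
    xi[small] = 0.0
    keep = ~small
    if keep.any():
        xi[keep] = np.linalg.lstsq(Theta[:, keep], target, rcond=None)[0]

print(dict(zip(["u", "u_x", "u_xx", "u*u_x"], np.round(xi, 3))))
# expected, up to finite-difference error: u_x ~ -1.0, u_xx ~ 0.1, others ~ 0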


10.6.8.9 Type-V: Physics-Discovered Machine Learning (PDML)
Type-V ML, or so-called physics-discovered ML (PDML), is the extreme case. It is used under either of two conditions: first, when no prior knowledge of the physics is assumed; second, when existing models and modeling tools are assumed to be untrustworthy or inapplicable to the thermal fluid system under consideration. More generally, Type-V ML is "equation-free" and instrumental in the search for a new modeling paradigm for complex thermal fluid systems. Type-V ML does not involve conservation equations, nor does it satisfy any criterion in Table 10.2; instead, it relies wholly on data to discover effective predictive models. However, such a situation rarely occurs, because there are usually physics principles or hypotheses that can be postulated to reduce the dimension of the problem. For discussion related to Type-V ML, readers are referred to [Mills et al.]315 and [Hanna et al.]316.
10.6.8.10 Knowledge and Data Requirements for ML Frameworks in Thermal Fluid Simulation
In the present context of ML, knowledge refers to the body of theoretical and empirical evidence that is available and trustworthy for understanding and describing the physical mechanisms that underlie the thermal fluid processes under consideration. This knowledge can guide the selection of model forms, including conservation equations and the corresponding closure relations, the design of experiments, and the execution of high-fidelity simulations. The data requirements refer to the characteristics of the body of data (e.g., types, amount, quality) needed to enable thermal fluid simulation with the required accuracy. In other words, the required data must be sufficient to complement the "knowledge" for building closure models and for recovering/discovering the physics. The form of the PDEs is known for Type-I, Type-II, and Type-III ML, and the focus is on building closure relations. In traditional modeling approaches, closure models are local, relating a group of (local) source terms (i.e., sub-grid-scale interactions) to a group of (local) flow features. Even when, in the engineering literature, source terms are expressed in terms of global parameters (such as flow rate or system pressure), those parameters are used as surrogates for local-valued parameters (through assumptions that equate global and local conditions). Type-I ML builds closure relations independently of the PDEs, but it requires a thorough or assumed understanding of the physics, which is essential to set up the SETs for acquiring data.

314 Brunton S.L., Proctor J.L., Kutz J.N., Discovering governing equations from data by sparse identification of nonlinear dynamical systems, Proceedings of the National Academy of Sciences, 113 (2016) 3932-3937.
315 Mills K., Spanner M., Tamblyn I., Deep learning and the Schrödinger equation, (2017).
316 Hanna B.N., Dinh N.T., Youngblood R.W., Bolotnov I.A., Coarse-Grid Computational Fluid Dynamics (CG-CFD) Error Prediction using Machine Learning, under review, (2017).


Globally measured data, or locally measured data obtained with point instruments, amount to only a small quantity of data; in such cases, complicated ML-based closures are not necessarily the best choice. Therefore, among the frameworks, Type-I ML has the smallest data requirement but the largest knowledge requirement. Type-II ML assumes prior knowledge of the physics that guides the selection of closure relations for thermal fluid simulation. However, the use of prior models introduces uncertainty into the thermal fluid analysis. This uncertainty (or error) can be inferred by comparing the model prediction to reference solutions from high-fidelity simulations, high-resolution experiments, or data obtained in IETs that include multi-physics phenomena. Correspondingly, Type-II ML requires larger data quantities but less knowledge than Type-I ML. Type-III ML trains closure relations that are embedded in the conservation equations without invoking a scale separation assumption, so IET data can be assimilated directly into the simulation. While the term ML is broad, in the present work it refers to the use of non-parametric models or, more narrowly, of DNNs; this implies no prior knowledge of the model form of the closure relations. Thus, Type-III ML requires less knowledge than Type-II ML (which relies on "best-estimate" closure models built from past data), but consequently requires a larger body of data to represent the models. Type-IV ML intends to avoid any bias in the selection of the conservation equations; instead, it recovers the exact PDE form from data. It assumes less prior knowledge but requires more extensive training data than the previous three frameworks. Type-V ML is the extreme case that makes no assumption about prior knowledge or reference solutions for the thermal fluid system under consideration. The aim is to apply ML methods to learn from data and to establish a data-driven predictive capability, which for thermal fluid simulation means discovering the effective model form of the conservation equations and the closure relations. Accordingly, among the frameworks, Type-V ML is the most stringent with respect to data requirements (types, quantity, and quality). Figure 10.14 depicts the domain of the ML frameworks with respect to prior knowledge and data requirements.

Figure 10.14 Domain of Various ML Frameworks, where L, M, and H Denote Low, Medium, and High - (Courtesy of Chang & Dinh)

10.6.8.11 Case Study - Heat Conduction Investigation by Type I ML Framework
The heat conduction case study is formulated to demonstrate how to employ Type-I, Type-II, and Type-III ML to build an ML-based thermal conductivity and to compare the results of each framework. Chanda et al. used an ANN with a genetic algorithm317 to solve inverse heat conduction problems. In this work, Deep Learning (DL)318 is selected as the ML methodology. In principle, any neural network (NN) with more than two layers (one hidden layer plus an output layer) is considered to be DL319. [Hornik]320 proved that multilayer NNs are universal approximators that can capture the properties of any measurable function. This capability makes DL attractive for closure development in thermal fluid simulation. Notably, we implement the NN-based thermal conductivity with both FNNs and convolutional neural networks (CNNs) to evaluate the performance of closure relations built with distinct NNs.
10.6.8.11.1 Problem Formulation
We formulate the synthetic task using a 2D (two-dimensional) heat conduction model given by Eq. 10.8, where k(T) is a nonlinear thermal conductivity. To generate training data, Eq. 10.8 also shows a temperature-dependent model for k(T), where c, σ, and μ are constant parameters. Table 10.3 gives

317 Hanna B.N., Dinh N.T., Youngblood R.W., Bolotnov I.A., Coarse-Grid Computational Fluid Dynamics (CG-CFD) Error Prediction using Machine Learning, under review, (2017).
318 LeCun Y., Bengio Y., Hinton G., Deep learning, Nature, 521 (2015) 436-444.
319 Heaton J., Artificial Intelligence for Humans, Volume 3: Deep Learning and Neural Networks, Heaton Research, Inc., Chesterfield, MO, 2015.
320 Hornik K., Stinchcombe M., White H., Multilayer Feedforward Networks are Universal Approximators, Neural Networks, 2 (1989) 359-366.


two parameter sets (a baseline set and a prior set) used to generate the data. When the ML frameworks are demonstrated, k(T) is replaced by the NN-based thermal conductivity.

\frac{\partial}{\partial x}\left[k(T)\,\frac{\partial T}{\partial x}\right] + \frac{\partial}{\partial y}\left[k(T)\,\frac{\partial T}{\partial y}\right] = 0 , \qquad k(T) = \frac{c}{\sigma\sqrt{2\pi}}\, e^{-\frac{(T-\mu)^2}{2\sigma^2}}        (Eq. 10.8)

Data Set                                                  c (W/m)    σ (K)    μ (K)
Baseline set for producing synthetic data                 7.2x10^4   300      1200
Prior set for producing inputs required by Type-II ML     7.2x10^4   600      2100

Table 10.3 Parameter Sets for the Thermal Conductivity Model - (Courtesy of Chang & Dinh)
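For reference, a short Python check that evaluates the k(T) model of Eq. 10.8 for the baseline and prior parameter sets of Table 10.3; the sampled temperature points are arbitrary.

# Evaluate the Gaussian conductivity model of Eq. 10.8 / Table 10.3.
import numpy as np

def k_of_T(T, c, sigma, mu):
    return c / (sigma * np.sqrt(2.0 * np.pi)) * np.exp(-(T - mu) ** 2 / (2.0 * sigma ** 2))

T = np.linspace(1000.0, 1400.0, 5)
print(k_of_T(T, c=7.2e4, sigma=300.0, mu=1200.0))   # baseline set
print(k_of_T(T, c=7.2e4, sigma=600.0, mu=2100.0))   # prior set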

Two numerical experiments are designed to emulate IETs and SETs for manufacturing synthetic data by solving Eq. 10.8 with the parameter sets in Table 10.3. IETs provide field data, for instance 2D temperature fields from an infrared camera; SETs offer global data, such as 1D measurements by thermocouples. The synthetic data are used for training and validating the NN-based thermal conductivity. Type-I ML can only use SET data because of its scale separation assumption. Type-II ML also uses only SET data because its goal is to improve the prior thermal conductivity toward the baseline. Type-III and Type-V ML use field data. We therefore compare Type-I and Type-II ML using training data from SETs, and then compare Type-III and Type-V ML using field data from IETs.
10.6.8.11.2 Manufacturing IET Data
IETs are measurements of temperature fields. Synthetic IET data are generated from Eq. 10.8 with the baseline set in Table 10.3. Figure 10.15 illustrates the layout of the IET experiments with four constant boundary temperatures. We vary Twest to obtain different observations and fix the boundary temperature at the east side (1300 K); Tnorth and Tsouth depend linearly on the west boundary condition. We prepare three training datasets with distinct data quantities and three validating datasets obtained by changing Twest. Table 10.4 gives the metadata of each training and validating dataset; all observations are uniformly sampled within the given temperature range.

Data Set   Data Quantity       Temperature Range at Twest   Description
T1         11 observations     [1000 K, 1100 K]             Training data set
T2         100 observations    [1000 K, 1100 K]             Training data set
T3         1000 observations   [1000 K, 1100 K]             Training data set
P1         1000 observations   [1000 K, 1100 K]             Validating data set
P2         1000 observations   [900 K, 1000 K]              Validating data set
P3         1000 observations   [800 K, 900 K]               Validating data set

Table 10.4 Summary of IET Training and Validating Data Sets - (Courtesy of Chang & Dinh)

Figure 10.15 Schematic of Integral Effects Tests (IETs) for Measuring Temperature Fields - (Courtesy of Chang & Dinh)


10.6.8.11.3 Manufacturing SET Data
SETs are global measurements made with thermocouples. Figure 10.16 depicts the layout of the SETs for obtaining mean-temperature and heat-conductivity data. A heater on top of the sample maintains a constant temperature (TH), thermal insulation is installed on the outside surfaces, and the coolant at the bottom removes heat with a constant heat transfer coefficient. Eq. 10.9 governs the temperature profile within the sample, using the parameter sets in Table 10.3, and also defines the observed heat conductivity (kobs); the mean temperature is obtained by arithmetically averaging TH and TC.

\frac{\partial}{\partial x}\left[k(T)\,\frac{\partial T}{\partial x}\right] = 0 , \qquad k_{obs}\,\frac{T_H - T_C}{H} = h\,(T_C - T_{coolant})        (Eq. 10.9)

Figure 10.16 Schematic of Separate Effects Tests (SETs) for Measuring Thermal Conductivity as a Function of the Sample's Mean Temperature - (Courtesy of Chang & Dinh)

We generate two training datasets with two coolant temperatures to explore the effect of different data qualities; Table 10.5 gives the metadata of the SET datasets. A large temperature gradient across the test sample increases the nonlinearity of the temperature profile. For each training set, we uniformly sample 41 values of TH from the range given by Eq. 10.10 so that the mean temperatures in the SETs stay within the same range as in the IETs.

\left(T_{H,max}\,,\;T_{H,min}\right) = \left(2\,T_{IET,max} - T_{coolant}\,,\;\; 2\,T_{IET,min} - T_{coolant}\right)        (Eq. 10.10)
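The short sketch below post-processes the SET observables according to Eq. 10.9 and samples the heater temperatures according to Eq. 10.10. The heat transfer coefficient, sample thickness, and cold-side temperature used in the single evaluation are illustrative values only, since they are not quoted in the text.

# k_obs from Eq. 10.9 and heater-temperature sampling from Eq. 10.10.
import numpy as np

def k_observed(T_H, T_C, T_coolant, h, H):
    # Eq. 10.9: k_obs (T_H - T_C) / H = h (T_C - T_coolant)
    return h * (T_C - T_coolant) * H / (T_H - T_C)

T_IET_min, T_IET_max, T_coolant = 1000.0, 1100.0, 800.0          # data set S1
T_H = np.linspace(2.0 * T_IET_min - T_coolant,
                  2.0 * T_IET_max - T_coolant, 41)               # Eq. 10.10 range
print(T_H[0], T_H[-1])                                           # 1200.0 1400.0

# single illustrative observation (T_C, h, and H are assumed for the example)
print(k_observed(T_H=1300.0, T_C=810.0, T_coolant=800.0, h=500.0, H=0.01))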

Data Set   Data Quantity     Data Quality   Tcoolant (K)   Description
S1         41 observations   Low            800            Training dataset
S2         41 observations   High           900            Training dataset

Table 10.5 Summary of SET Training Datasets - (Courtesy of Chang & Dinh)

10.6.8.11.4 Implementation of the Heat Conduction by the Type-I ML Framework
We present Type-I ML in Algorithm 1. SET data are generated by Eq. 10.9 with the baseline set in Table 10.3; the inputs and targets are temperatures and thermal conductivities, respectively. After the training, the FNN-based thermal conductivity is implemented in Eq. 10.8 for predictions.


Algorithm 1 - Type-I ML for the 2D Heat Conduction Problem with Dirichlet BCs
Input:  training inputs (Tbaseline, Element 3 in Figure 10.13) and training targets (kbaseline, Element 4 in Figure 10.13) from SETs (Element 1 in Figure 10.13)
Output: temperature fields for predictions (Element 7 in Figure 10.13)
1  for all epochs < maximum_epoch do (Element 5 in Figure 10.13)
2      // Build a conductivity model using FNNs
3      k(T) ← FNN(T)
4      for all (Tbaseline, kbaseline) ∈ training datasets do
5          Update the hyperparameters of each layer in the FNN
6  Implement k(T) in Eq. 10.8 (Element 7 in Figure 10.13)
7  Solve Eq. 10.8 with Dirichlet boundaries for predictions (Element 7 in Figure 10.13)

For additional information, and for how to construct the Type-II, III, IV, and V algorithms, please consult the work by [Chang and Dinh]321.
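As a complement to Algorithm 1, the following is a minimal, self-contained Python sketch of the same Type-I workflow under stated assumptions: scikit-learn's MLPRegressor stands in for the FNN of the original work, the SET pairs are manufactured from the baseline k(T) of Eq. 10.8 and Table 10.3, and the grid size, Dirichlet boundary values, and Picard/Jacobi iteration settings are illustrative choices only.

# Type-I sketch: fit an FNN surrogate k(T) to SET pairs, then embed it in a
# finite-difference Picard solve of Eq. 10.8 on the unit square (Dirichlet BCs).
import numpy as np
from sklearn.neural_network import MLPRegressor

def k_true(T, c=7.2e4, sigma=300.0, mu=1200.0):          # baseline set, Table 10.3
    return c / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(T - mu) ** 2 / (2 * sigma ** 2))

# Elements 1-4: SET training data (temperature -> conductivity)
T_train = np.linspace(1000.0, 1400.0, 41).reshape(-1, 1)
k_train = k_true(T_train).ravel()

# Element 5: train the FNN-based closure (inputs crudely scaled)
fnn = MLPRegressor(hidden_layer_sizes=(20, 20), max_iter=20000, random_state=0)
fnn.fit(T_train / 1000.0, k_train)
k_ml = lambda T: np.clip(fnn.predict(T.reshape(-1, 1) / 1000.0), 1e-3, None).reshape(T.shape)

# Element 7: embed the closure in the 2-D conduction solve
n = 21
T = np.full((n, n), 1100.0)
T[0, :], T[-1, :], T[:, 0], T[:, -1] = 1000.0, 1300.0, 1050.0, 1250.0   # assumed BCs

for it in range(2000):                                   # Picard / Jacobi iteration
    k = k_ml(T)
    kE = 0.5 * (k[1:-1, 1:-1] + k[2:, 1:-1]);  kW = 0.5 * (k[1:-1, 1:-1] + k[:-2, 1:-1])
    kN = 0.5 * (k[1:-1, 1:-1] + k[1:-1, 2:]);  kS = 0.5 * (k[1:-1, 1:-1] + k[1:-1, :-2])
    T_new = (kE * T[2:, 1:-1] + kW * T[:-2, 1:-1] +
             kN * T[1:-1, 2:] + kS * T[1:-1, :-2]) / (kE + kW + kN + kS)
    diff = float(np.max(np.abs(T_new - T[1:-1, 1:-1])))
    T[1:-1, 1:-1] = T_new
    if diff < 1e-6:
        break

print("final update:", diff, "after", it + 1, "iterations; field range:",
      float(T.min()), float(T.max()))

Replacing the surrogate with the true k(T) in the same solve would give the "baseline" field; the difference between the two solutions is a direct measure of the closure error propagated through the PDE, which is the quantity the case study examines.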

Figure 10.17 Architecture of the CNN-Based Thermal Conductivity (adapted after LeCun)

10.6.8.11.5 CNN-Based Thermal Conductivity Model
Figure 10.17 depicts the architecture322 of the CNN-based thermal conductivity model, which comprises three convolutional layers followed by three fully connected layers. We use the [ReLU]323 activation in the CNN layers to accelerate training. The inputs are temperature fields. The first convolutional layer generates eight feature maps, each detecting patterns in the temperature field; the second convolutional layer takes the previous layer's outputs and produces 12 feature maps; the third convolutional layer delivers 24 feature maps to the fully connected layers. Finally, the thermal conductivity fields are obtained from the CNN outputs. Learning is an optimization process, so we need to define a cost function, based on the distinct types of data, that tells the ML algorithm how to tune the NN hyperparameters.

321 Chih-Wei Chang and Nam T. Dinh, Classification of Machine Learning Frameworks for Data-Driven Thermal Fluid Models, North Carolina State University, Raleigh, NC 27695-7909.
322 LeCun Y., Bottou L., Bengio Y., Haffner P., Gradient-based learning applied to document recognition, Proceedings of the IEEE, 86 (1998) 2278-2324.
323 Nair V., Hinton G.E., Rectified linear units improve restricted Boltzmann machines, 2010, pp. 807-814.


Eq. 10.11 defines the cost function, where N, yi,data, and yi,model are the total number of training data, the i-th training datum, and the i-th model solution, respectively. To prevent overfitting, a regularization term is added, where i and NL denote the i-th layer and the total number of layers, λi is the regularization strength, and Wi is the matrix of weights in the i-th layer. We implement the NN-based thermal conductivity using TensorFlow324, the DL framework developed by Google; the weights and biases of the NNs are tuned from the data using the [Adam]325 algorithm.

E = \frac{1}{2N}\sum_{i=1}^{N}\left(y_{i,model} - y_{i,data}\right)^{2} + \sum_{i=1}^{N_L}\lambda_i\,\lVert W_i\rVert^{2}        (Eq. 10.11)
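A sketch of the CNN described above, written with the tf.keras API (the text names TensorFlow and Adam); the kernel sizes, the widths of the dense layers, the 32x32 field size, and the regularization strength are assumptions made for illustration, and the L2 penalties on the layer weights play the role of the second term in Eq. 10.11.

# CNN-based conductivity surrogate: 3 conv layers (8, 12, 24 feature maps, ReLU)
# followed by 3 fully connected layers, trained with an MSE-plus-L2 objective.
import tensorflow as tf

reg = tf.keras.regularizers.l2(1e-4)          # plays the role of lambda_i in Eq. 10.11
model = tf.keras.Sequential([
    tf.keras.Input(shape=(32, 32, 1)),        # temperature field (assumed size)
    tf.keras.layers.Conv2D(8,  3, padding="same", activation="relu", kernel_regularizer=reg),
    tf.keras.layers.Conv2D(12, 3, padding="same", activation="relu", kernel_regularizer=reg),
    tf.keras.layers.Conv2D(24, 3, padding="same", activation="relu", kernel_regularizer=reg),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation="relu", kernel_regularizer=reg),
    tf.keras.layers.Dense(256, activation="relu", kernel_regularizer=reg),
    tf.keras.layers.Dense(32 * 32, kernel_regularizer=reg),
    tf.keras.layers.Reshape((32, 32, 1)),     # conductivity field, same size as input
])

# mean-squared-error data term + L2 penalties ~ Eq. 10.11 (up to the 1/2 convention)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss="mse")
model.summary()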

10.6.8.11.6 Closing Remarks
The present study is motivated by the growing interest in and development of machine learning models for thermal fluid simulation. The trend is powered by the advent of data-intensive research methods, such as modern thermo-fluid experiments and high-fidelity numerical simulations, affordable computing (data processing) power and memory, and progress in machine learning methods, particularly in deep learning using multilayer neural networks. We introduced a classification of machine learning frameworks for thermal fluid simulation comprising five types. The selection of the optimal ML framework is problem-dependent and, to a substantial extent, depends on the characteristics of the supporting data, including data source, data type, data quantity, and data quality. Although examples of Type-I and Type-II models exist in the literature, their development is still in its infancy. While technically straightforward, both Type-I and Type-II models are limited to systems whose governing physics is open to "divide-and-conquer". In Type-III, the PDEs are involved in the training of the machine-learning models, thus alleviating the requirement for a scale separation assumption and potentially reducing the need for physics decomposition. Correspondingly, Type-III models impose more stringent requirements on modeling and demand substantially greater computing resources for training. Based on insights from the case study performed, Type-III ML has the highest potential for extracting value from "big data" in thermal fluid research while ensuring data-model consistency. Several technical challenges need to be addressed before Type-III models deliver on their promise in practical thermal fluid simulation, namely:
➢ Complex interactions of ML-based closures with a system of PDEs (including discontinuities in hyperbolic systems);
➢ The effect of the non-local character of ML-based models on PDE solution methods; and
➢ The implementation and effect of multiple closure models, particularly in multiphase and thermal flows.

324 Abadi M., Agarwal A., Barham P., Brevdo E., Chen Z., Citro C., Corrado G.S., Davis A., Dean J., Devin M., et al., TensorFlow: Large-scale machine learning on heterogeneous distributed systems, (2016).
325 Kingma D.P., Ba J., Adam: A Method for Stochastic Optimization, (2014).


11 Appendix A

11.1 Routine for Inverse Distance Weighted Interpolation (Shepard's Method)

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <stdbool.h>
#include "cse.h"

#define Large 1.0e+30
#define Small -1.0e+30

/* Accumulate the weighted interface displacements into (dum_x, dum_y, dum_z).
   The body of this routine is garbled in the source document; only its opening
   statements survive, so the weighted-sum form below is a reconstruction (an
   assumption) based on how RHS is called after get_weight().  Note that, as in
   the source, the accumulators are passed by value, so the results do not reach
   the caller unless the signature is changed to pointers. */
void RHS (double dx[], double dy[], double dz[], int num, double dum_x,
          double dum_y, double dum_z, double omega[])
{
    int i;
    double dx_new, dy_new, dz_new, DX, DY, DZ;   /* DX, DY, DZ: use lost in extraction */
    dum_x = dum_y = dum_z = 0.0;
    for (i = 0; i < num; i++) {
        dx_new = dx[i];
        dy_new = dy[i];
        dz_new = dz[i];
        dum_x += omega[i] * dx_new;              /* reconstructed */
        dum_y += omega[i] * dy_new;              /* reconstructed */
        dum_z += omega[i] * dz_new;              /* reconstructed */
    }
}

/* Franke-Little / Shepard weights.  The opening of this routine (signature and
   local declarations) was lost in extraction; it is reconstructed here from the
   surviving loops and from the call get_weight(h, omega, num_interface_nodes). */
void get_weight (double h[], double omega[], int num)
{
    int i;
    double F[num], F2 = 0.0, hmax = Small;

    /* largest separation distance (surviving fragment of the first loop) */
    for (i = 0; i < num; i++) {
        hmax = (h[i] > hmax) ? h[i] : hmax;
        omega[i] = 1.0;
    }


    /* Franke-Little weight function values (kept verbatim from the source;
       note the standard form uses (hmax*h[i]) in the denominator) */
    for (i = 0; i < num; i++) {
        F[i] = ((hmax - h[i]) / hmax * h[i]) * ((hmax - h[i]) / hmax * h[i]);
        F2  += F[i];
    }

    /* normalize the weights and sanity-check the result */
    for (i = 0; i < num; i++) {
        omega[i] = F[i] / F2;
        if (h[i] == 0.0) omega[i] = 1.0;
        if (omega[i] < 0.0 || omega[i] > 1.0) {
            fprintf(stderr, " omega[i] = %.3lf \n", omega[i]);
            fprintf(stderr, " Error - The weight function should be between 0 and 1\n");
            exit(1);
        }
    }
}

/* Diagonal of the bounding box of a point cloud. */
double Get_R (double x[], double y[], double z[], int num)
{
    double xmax, ymax, zmax, xmin, ymin, zmin, R;
    xmax = ymax = zmax = Small;
    xmin = ymin = zmin = Large;
    for (int i = 0; i < num; i++) {
        xmax = (x[i] > xmax) ? x[i] : xmax;
        ymax = (y[i] > ymax) ? y[i] : ymax;
        zmax = (z[i] > zmax) ? z[i] : zmax;
        xmin = (x[i] < xmin) ? x[i] : xmin;
        ymin = (y[i] < ymin) ? y[i] : ymin;
        zmin = (z[i] < zmin) ? z[i] : zmin;
    }
    return R = sqrt((xmax - xmin) * (xmax - xmin) +
                    (ymax - ymin) * (ymax - ymin) +
                    (zmax - zmin) * (zmax - zmin));
}

void EIDW (int num_interface_nodes1, int num_nodes1, int num_outer_nodes1)
/* current global search is insufficient and CPU intensive and should be
   modified to a more localized search method */
{
    double dum_x, dum_y, dum_z,
           omega[num_interface_nodes1], h[num_interface_nodes1],
           xx[2], yy[2], zz[2],
           xmin, xmax, ymin, ymax, zmin, zmax,
           R_solid, R_fluid, hmin, d1, d2, d3, box;
    int    num_nodes, num_interface_nodes, line, i_bar, i, j, ii, n_solid;
    bool   skip;
    double dx[num_interface_nodes1], dy[num_interface_nodes1], dz[num_interface_nodes1];
    double x[num_nodes1], y[num_nodes1], z[num_nodes1];


    /* Read the data (test - VTK format / CSE) */
    int index = 0;
    FILE* file_ptr;

    file_ptr = fopen("dum0", "r");
    if (file_ptr == NULL) { printf(" ** Error opening dum0 file.\n"); exit(1); }
    int n0;
    fscanf(file_ptr, "%d\n", &n0);
    for (line = 0; line < n0/2; line++) {
        fscanf(file_ptr, "%lf %lf %lf %lf %lf %lf",
               &xx[0], &yy[0], &zz[0], &xx[1], &yy[1], &zz[1]);
        for (i_bar = 0; i_bar < 2; i_bar++) {
            i = i_bar;
            if (line > 0) i = line * 2 + i_bar;
            dx[i+index] = xx[i_bar];
            dy[i+index] = yy[i_bar];
            dz[i+index] = zz[i_bar];
        }
    }
    index = n0;
    printf(" Finish reading the dum0 file. \n");
    fclose(file_ptr);

    file_ptr = fopen("dum1", "r");
    if (file_ptr == NULL) { printf(" ** Error opening dum1 file.\n"); exit(1); }
    int n1;
    fscanf(file_ptr, "%d\n", &n1);
    for (line = 0; line < n1/2; line++) {
        fscanf(file_ptr, "%lf %lf %lf %lf %lf %lf",
               &xx[0], &yy[0], &zz[0], &xx[1], &yy[1], &zz[1]);
        for (i_bar = 0; i_bar < 2; i_bar++) {
            i = i_bar;
            if (line > 0) i = line * 2 + i_bar;
            dx[i+index] = xx[i_bar];
            dy[i+index] = yy[i_bar];
            dz[i+index] = zz[i_bar];
        }
    }
    index = n0 + n1;
    dx[index-1] = -4.6567497253;
    dy[index-1] = -0.0067161722109;
    dz[index-1] = -0.55055594444;
    printf(" Finish reading the dum1 file. \n");


    fclose(file_ptr);

    //FILE* file_ptr ;
    file_ptr = fopen("dum2", "r");
    if (file_ptr == NULL) { printf(" ** Error opening dum2 file.\n"); exit(1); }
    int n2;
    fscanf(file_ptr, "%d\n", &n2);
    for (line = 0; line < n2/2; line++) {
        fscanf(file_ptr, "%lf %lf %lf %lf %lf %lf",
               &xx[0], &yy[0], &zz[0], &xx[1], &yy[1], &zz[1]);
        for (i_bar = 0; i_bar < 2; i_bar++) {
            i = i_bar;
            int index = n0 + n1;          /* local index shadows the outer one */
            if (line > 0) i = line * 2 + i_bar;
            dx[i+index] = xx[i_bar];
            dy[i+index] = yy[i_bar];
            dz[i+index] = zz[i_bar];
        }
    }
    index = n0 + n1 + n2;
    printf(" Finish reading the dum2 file. \n");
    fclose(file_ptr);

    //FILE* file_ptr ;
    file_ptr = fopen("dum3", "r");
    if (file_ptr == NULL) { printf(" ** Error opening dum3 file.\n"); exit(1); }
    int n3;
    fscanf(file_ptr, "%d\n", &n3);
    for (line = 0; line < n3/2; line++) {
        fscanf(file_ptr, "%lf %lf %lf %lf %lf %lf",
               &xx[0], &yy[0], &zz[0], &xx[1], &yy[1], &zz[1]);
        for (i_bar = 0; i_bar < 2; i_bar++) {
            i = i_bar;
            int index = n0 + n1 + n2;     /* local index shadows the outer one */
            if (line > 0) i = line * 2 + i_bar;
            dx[i+index] = xx[i_bar];
            dy[i+index] = yy[i_bar];
            dz[i+index] = zz[i_bar];
        }
    }
    index = n0 + n1 + n2 + n3;
    num_interface_nodes = index;
    printf(" Finish reading the dum3 file. \n");


    fclose(file_ptr);
    printf(" n0 n1 n2 n3 = %d,%d,%d,%d \n", n0, n1, n2, n3);
    printf(" num_interface_nodes = %d \n", num_interface_nodes);

    file_ptr = fopen("internal", "r");
    if (file_ptr == NULL) { printf(" ** Error opening internal file\n."); exit(1); }
    fscanf(file_ptr, "%d\n", &num_nodes);

    /* check for memory */
    int *cfdpointer;
    cfdpointer = (int*) malloc(sizeof(num_nodes));
    if (cfdpointer == NULL) {
        printf(" num_nodes = %d \n", num_nodes);
        printf(" **Error - could not allocate memory for cfd data.\n");
        exit(1);
    }

    for (line = 0; line < num_nodes/2; line++) {
        fscanf(file_ptr, " %lf %lf %lf %lf %lf %lf",
               &xx[0], &yy[0], &zz[0], &xx[1], &yy[1], &zz[1]);
        for (i_bar = 0; i_bar < 2; i_bar++) {
            i = i_bar;
            if (line > 0) i = line * 2 + i_bar;
            x[i] = xx[i_bar];
            y[i] = yy[i_bar];
            z[i] = zz[i_bar];
        }
    }
    fclose(file_ptr);
    printf(" Finished reading the internal file. \n");
    printf(" num_nodes = %d \n", num_nodes);

    /* get max / min of the interface (solid) nodes */
    xmax = ymax = zmax = Small;
    xmin = ymin = zmin = Large;
    for (i = 0; i < num_interface_nodes; i++) {
        xmax = (dx[i] > xmax) ? dx[i] : xmax;
        ymax = (dy[i] > ymax) ? dy[i] : ymax;
        zmax = (dz[i] > zmax) ? dz[i] : zmax;
        xmin = (dx[i] < xmin) ? dx[i] : xmin;
        ymin = (dy[i] < ymin) ? dy[i] : ymin;
        zmin = (dz[i] < zmin) ? dz[i] : zmin;
    }


    R_solid = Get_R(dx, dy, dz, num_interface_nodes);
    R_fluid = Get_R(x, y, z, num_nodes);

    /* loop for each cfd field ( EIDW - Shepard's Method) */
    n_solid = 0;
    box = R_fluid;
    for (j = 0; j < num_nodes; j++) {
        skip = false;
        double xf = x[j];
        double yf = y[j];
        double zf = z[j];

        /* get Euclidian distances and normalized weights */
        hmin = Large;
        for (i = 0; i < num_interface_nodes; i++) {
            d1 = xf - dx[i];
            d2 = yf - dy[i];
            d3 = zf - dz[i];
            h[i] = sqrt(d1*d1 + d2*d2 + d3*d3);
            hmin = (h[i] < hmin) ? h[i] : hmin;
        }
        printf(" pass 1 j = %d \n", j);

        /* get weight function values */
        get_weight(h, omega, num_interface_nodes);
        RHS(dx, dy, dz, num_interface_nodes, dum_x, dum_y, dum_z, omega);

        /* update new field positions for cfd */
        if (skip) {
            x[j] = xf;
            y[j] = yf;
            z[j] = zf;
        } else {
            x[j] = dum_x + x[j];
            y[j] = dum_y + y[j];
            z[j] = dum_z + z[j];
        }
    }
    printf(" end of big loop .....\n");

    /* output (test - VTK format / CSE) */
    file_ptr = fopen("fluid_mesh_new", "w");
    if (file_ptr == NULL) { printf(" ** Error opening cfd_mesh_new file to write."); exit(1); }
    printf(" trying to write \n");
    printf(" cfd.num_nodes = %d \n", num_nodes);


    for (line = 0; line < num_nodes/2; line++) {
        for (i_bar = 0; i_bar < 2; i_bar++) {
            ii = i_bar;
            if (line > 0) ii = line * 2 + i_bar;
            xx[i_bar] = x[ii];
            yy[i_bar] = y[ii];
            zz[i_bar] = z[ii];
        }
        fprintf(file_ptr, " %.11f %.11f %.11f %.11f %.11f %.11f\n",
                xx[0], yy[0], zz[0], xx[1], yy[1], zz[1]);
    }
    fclose(file_ptr);
    printf(" Done...\n");
}

int main()
{
#define num_interface_nodes1 30000
#define num_nodes1 400000
#define num_outer_nodes1 50000

    /* check memory requirements */
    int *Workarray = NULL;
    Workarray = (int*) malloc(sizeof(num_interface_nodes1));
    if (NULL == Workarray) {
        printf(" num_interface_nodes1 = %d \n", num_interface_nodes1);
        printf(" **Error - could not allocate memory for cfd data.\n");
        exit(1);
    }
    Workarray = (int*) malloc(sizeof(num_nodes1));
    if (Workarray == NULL) {
        printf(" num_nodes1 = %d \n", num_nodes1);
        printf(" **Error - could not allocate memory for cfd data.\n");
        exit(1);
    }
    Workarray = (int*) malloc(sizeof(num_outer_nodes1));
    if (Workarray == NULL) {
        printf(" num_outer_nodes1 = %d \n", num_outer_nodes1);
        printf(" **Error - could not allocate memory for cfd data.\n");
        exit(1);
    }

    EIDW(num_interface_nodes1, num_nodes1, num_outer_nodes1);
    return 0;
}