% Journal article describing the design, theory, and implementation of the
% International Land Model Benchmarking (ILAMB) system.
% NOTE(review): the journal field references the string macro JAMES, whose
% definition is not visible in this part of the file -- confirm it is declared
% before this entry, otherwise the journal name renders empty.
@Article{Collier_JAMES_20181101,
 author		= {Collier, Nathan and Hoffman, Forrest M. and Lawrence, David M. and Keppel-Aleks, Gretchen and Koven, Charles D. and Riley, William J. and Mu, Mingquan and Randerson, James T.},
 title		= {The {International} {Land} {Model} {Benchmarking} ({ILAMB}) System: Design, Theory, and Implementation},
 journal	= JAMES,
 volume		= 10,
 number		= 11,
 pages		= {2731--2754},
 doi		= {10.1029/2018MS001354},
 day		= 1,
 month		= nov,
 year		= 2018,
 abstract	= {The increasing complexity of Earth system models (ESMs) has inspired efforts to quantitatively assess model fidelity through rigorous comparison with best-available measurements and observational data products. ESMs exhibit a high degree of spread in predictions of land biogeochemistry, biogeophysics, and hydrology, which are sensitive to forcing from other model components. Based on insights from prior land model evaluation studies and community workshops, the authors developed an open source model benchmarking software package that generates graphical diagnostics and scores model performance in support of the International Land Model Benchmarking (ILAMB) project. Employing a suite of in situ, remote sensing, and reanalysis datasets, the ILAMB package performs comprehensive model assessment across a wide range of land variables and generates a hierarchical set of webpages containing statistical analyses and figures designed to provide the user insights into strengths and weaknesses of multiple models or model versions. Described here is the benchmarking philosophy and mathematical methodology embodied in the most recent implementation of the ILAMB package. Comparison methods unique to a few specific datasets are presented, and guidelines for configuring an ILAMB analysis and interpreting resulting model performance scores are discussed. ILAMB is being adopted by modeling teams and centers during model development and for model intercomparison projects, and community engagement is sought for extending evaluation metrics and adding new observational datasets to the benchmarking framework.}
}