% Journal article in Nature, vol. 649, no. 8099. The publisher name and its
% city are stored in separate fields instead of one fused string.
@article{phan2025humanity,
  author    = {Phan, Long and Gatti, Alice and Han, Ziwen and Li, Nathaniel and Hu, Josephina and Zhang, Hugh and Zhang, Chen Bo Calvin and Shaaban, Mohamed and Ling, John and Shi, Sean and others},
  title     = {Humanity's Last Exam},
  journal   = {Nature},
  volume    = {649},
  number    = {8099},
  pages     = {1139--1146},
  year      = {2026},
  publisher = {Nature Publishing Group UK},
  address   = {London},
}
2025
arXiv
PRBench: Large-Scale Expert Rubrics for Evaluating High-Stakes Professional Reasoning
Afra Feyza Akyürek, Advait Gosai, Chen Bo Calvin Zhang, Vipul Gupta, Jaehwan Jeong, Anisha Gunjal, Tahseen Rabbani, Maria Mazzone, David Randolph, and 2 more authors
% arXiv preprint. The identifier is carried by eprint/archiveprefix rather
% than being embedded in the journal string; the benchmark name is braced so
% sentence-casing styles keep its capitalisation.
@article{akyurek2025prbench,
  author        = {Aky{\"u}rek, Afra Feyza and Gosai, Advait and Zhang, Chen Bo Calvin and Gupta, Vipul and Jeong, Jaehwan and Gunjal, Anisha and Rabbani, Tahseen and Mazzone, Maria and Randolph, David and Meymand, Mohammad Mahmoudi and others},
  title         = {{PRBench}: Large-Scale Expert Rubrics for Evaluating High-Stakes Professional Reasoning},
  journal       = {arXiv preprint},
  eprint        = {2511.11562},
  archiveprefix = {arXiv},
  year          = {2025},
}
ICLR
ResearchRubrics: A Benchmark of Prompts and Rubrics for Evaluating Deep Research Agents
Manasi Sharma, Chen Bo Calvin Zhang, Chaithanya Bandi, Clinton Wang, Ankit Aich, Huy Nghiem, Tahseen Rabbani, Ye Htet, Brian Jang, and 2 more authors
The Fourteenth International Conference on Learning Representations (ICLR), 2026
% Conference paper, so it is typed as inproceedings with the venue in
% booktitle. The year follows the venue: the Thirteenth ICLR is dated 2025
% elsewhere in this file, which places the Fourteenth in 2026. The key keeps
% its 2025 stem so existing citations still resolve.
@inproceedings{sharma2025researchrubrics,
  author    = {Sharma, Manasi and Zhang, Chen Bo Calvin and Bandi, Chaithanya and Wang, Clinton and Aich, Ankit and Nghiem, Huy and Rabbani, Tahseen and Htet, Ye and Jang, Brian and Basu, Sumana and others},
  title     = {{ResearchRubrics}: A Benchmark of Prompts and Rubrics for Evaluating Deep Research Agents},
  booktitle = {The Fourteenth International Conference on Learning Representations (ICLR)},
  year      = {2026},
}
ICLR
MoReBench: Evaluating Procedural and Pluralistic Moral Reasoning in Language Models, More Than Outcomes
Yu Ying Chiu, Michael S Lee, Rachel Calcott, Brandon Handoko, Paul Font-Reaulx, Paula Rodriguez, Chen Bo Calvin Zhang, Ziwen Han, Udari Madhushani Sehwag, and 2 more authors
The Fourteenth International Conference on Learning Representations (ICLR), 2026
% Conference paper, so it is typed as inproceedings with the venue in
% booktitle. The year follows the venue: the Thirteenth ICLR is dated 2025
% elsewhere in this file, which places the Fourteenth in 2026. The key keeps
% its 2025 stem so existing citations still resolve.
@inproceedings{chiu2025morebench,
  author    = {Chiu, Yu Ying and Lee, Michael S and Calcott, Rachel and Handoko, Brandon and de Font-Reaulx, Paul and Rodriguez, Paula and Zhang, Chen Bo Calvin and Han, Ziwen and Sehwag, Udari Madhushani and Maurya, Yash and others},
  title     = {{MoReBench}: Evaluating Procedural and Pluralistic Moral Reasoning in Language Models, More Than Outcomes},
  booktitle = {The Fourteenth International Conference on Learning Representations (ICLR)},
  year      = {2026},
}
arXiv
Beyond Seeing: Evaluating Multimodal LLMs on Tool-Enabled Image Perception, Transformation, and Reasoning
Xingang Guo, Utkarsh Tyagi, Advait Gosai, Paula Vergara, Jayeon Park, Ernesto Gabriel Hernandez Montoya, Chen Bo Calvin Zhang, Bin Hu, Yunzhong He, and 2 more authors
% arXiv preprint. The identifier is carried by eprint/archiveprefix rather
% than being embedded in the journal string; the acronym in the title is
% braced so sentence-casing styles keep its capitalisation.
@article{guo2025beyond,
  author        = {Guo, Xingang and Tyagi, Utkarsh and Gosai, Advait and Vergara, Paula and Park, Jayeon and Montoya, Ernesto Gabriel Hernandez and Zhang, Chen Bo Calvin and Hu, Bin and He, Yunzhong and Liu, Bing and others},
  title         = {Beyond Seeing: Evaluating Multimodal {LLMs} on Tool-Enabled Image Perception, Transformation, and Reasoning},
  journal       = {arXiv preprint},
  eprint        = {2510.12712},
  archiveprefix = {arXiv},
  year          = {2025},
}
arXiv
TutorBench: A Benchmark To Assess Tutoring Capabilities Of Large Language Models
Rakshith S Srinivasa, Zora Che, Chen Bo Calvin Zhang, Diego Mares, Ernesto Hernandez, Jayeon Park, Dean Lee, Guillermo Mangialardi, Charmaine Ng, and 2 more authors
% arXiv preprint. The identifier is carried by eprint/archiveprefix rather
% than being embedded in the journal string; the benchmark name is braced so
% sentence-casing styles keep its capitalisation.
@article{srinivasa2025tutorbench,
  author        = {Srinivasa, Rakshith S and Che, Zora and Zhang, Chen Bo Calvin and Mares, Diego and Hernandez, Ernesto and Park, Jayeon and Lee, Dean and Mangialardi, Guillermo and Ng, Charmaine and Cardona, Ed-Yeremai Hernandez and others},
  title         = {{TutorBench}: A Benchmark To Assess Tutoring Capabilities Of Large Language Models},
  journal       = {arXiv preprint},
  eprint        = {2510.02663},
  archiveprefix = {arXiv},
  year          = {2025},
}
arXiv
SWE-Bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks?
Xiang Deng, Jeff Da, Edwin Pan, Yannis Yiming He, Charles Ide, Kanak Garg, Niklas Lauffer, Andrew Park, Nitin Pasari, and 13 more authors
% arXiv preprint. The identifier is carried by eprint/archiveprefix rather
% than being embedded in the journal string; the benchmark name and the
% acronym are braced so sentence-casing styles keep their capitalisation.
@article{deng2025swe,
  author        = {Deng, Xiang and Da, Jeff and Pan, Edwin and He, Yannis Yiming and Ide, Charles and Garg, Kanak and Lauffer, Niklas and Park, Andrew and Pasari, Nitin and Rane, Chetan and Sampath, Karmini and Krishnan, Maya and Kundurthy, Srivatsa and Hendryx, Sean and Wang, Zifan and Bharadwaj, Vijay and Holm, Jeff and Aluri, Raja and Zhang, Chen Bo Calvin and Jacobson, Noah and Liu, Bing and Kenstler, Brad},
  title         = {{SWE-Bench Pro}: Can {AI} Agents Solve Long-Horizon Software Engineering Tasks?},
  journal       = {arXiv preprint},
  eprint        = {2509.16941},
  archiveprefix = {arXiv},
  year          = {2025},
}
ICLR
Reliable Weak-to-Strong Monitoring of LLM Agents (Oral)
Neil Kale, Chen Bo Calvin Zhang, Kevin Zhu, Ankit Aich, Paula Rodriguez, Scale Red Team, Christina Q Knight, and Zifan Wang
The Fourteenth International Conference on Learning Representations (ICLR), 2026
% Conference paper, so it is typed as inproceedings with the venue in
% booktitle. The group author is wrapped in its own braces so it is treated
% as one indivisible name instead of being split into first and last parts.
% The year follows the venue: the Thirteenth ICLR is dated 2025 elsewhere in
% this file, which places the Fourteenth in 2026. The non-standard oral field
% is kept untouched; standard styles ignore unknown fields.
@inproceedings{kale2025reliable,
  author    = {Kale, Neil and Zhang, Chen Bo Calvin and Zhu, Kevin and Aich, Ankit and Rodriguez, Paula and {Scale Red Team} and Knight, Christina Q and Wang, Zifan},
  title     = {Reliable Weak-to-Strong Monitoring of {LLM} Agents},
  booktitle = {The Fourteenth International Conference on Learning Representations (ICLR)},
  year      = {2026},
  oral      = {true},
}
arXiv
SHADE-Arena: Evaluating Sabotage and Monitoring in LLM Agents
Jonathan Kutasov, Yuqi Sun, Paul Colognese, Teun Weij, Linda Petrini, Chen Bo Calvin Zhang, John Hughes, Xiang Deng, Henry Sleight, and 3 more authors
% arXiv preprint. The identifier is carried by eprint/archiveprefix rather
% than being embedded in the journal string; the benchmark name and the
% acronym are braced so sentence-casing styles keep their capitalisation.
@article{kutasov2025shade,
  author        = {Kutasov, Jonathan and Sun, Yuqi and Colognese, Paul and van der Weij, Teun and Petrini, Linda and Zhang, Chen Bo Calvin and Hughes, John and Deng, Xiang and Sleight, Henry and Tracy, Tyler and Shlegeris, Buck and Benton, Joe},
  title         = {{SHADE-Arena}: Evaluating Sabotage and Monitoring in {LLM} Agents},
  journal       = {arXiv preprint},
  eprint        = {2506.15740},
  archiveprefix = {arXiv},
  year          = {2025},
}
IEEE
Vehicular Communication Security: Multi-Channel and Multi-Factor Authentication
Marco De Vincenzi, Shuyang Sun, Chen Bo Calvin Zhang, Manuel Garcia, Shaozu Ding, Chiara Bodei, Ilaria Matteucci, Sanjay E Sarma, and Dajiang Suo
% Journal article in IEEE Transactions on Vehicular Technology, identified by
% its DOI; the entry carries no volume or page data.
@article{de2025vehicular,
  author  = {De Vincenzi, Marco and Sun, Shuyang and Zhang, Chen Bo Calvin and Garcia, Manuel and Ding, Shaozu and Bodei, Chiara and Matteucci, Ilaria and Sarma, Sanjay E and Suo, Dajiang},
  title   = {Vehicular Communication Security: Multi-Channel and Multi-Factor Authentication},
  journal = {IEEE Transactions on Vehicular Technology},
  year    = {2025},
  doi     = {10.1109/TVT.2025.3598113},
}
ICLR
ORSO: Accelerating Reward Design via Online Reward Selection and Policy Optimization
Chen Bo Calvin Zhang, Zhang-Wei Hong, Aldo Pacchiano, and Pulkit Agrawal
In The Thirteenth International Conference on Learning Representations (ICLR), 2025
% Conference paper. The method acronym in the title is braced so that
% sentence-casing styles do not lowercase it.
@inproceedings{zhang2025orso,
  author    = {Zhang, Chen Bo Calvin and Hong, Zhang-Wei and Pacchiano, Aldo and Agrawal, Pulkit},
  title     = {{ORSO}: Accelerating Reward Design via Online Reward Selection and Policy Optimization},
  booktitle = {The Thirteenth International Conference on Learning Representations (ICLR)},
  year      = {2025},
}
2023
arXiv
Zero-Shot Transfer in Imitation Learning
Alvaro Cauderan, Gauthier Boeshertz, Florian Schwarb, and Chen Bo Calvin Zhang
% arXiv preprint. The identifier is carried by eprint/archiveprefix rather
% than being embedded in the journal string.
@article{cauderan2023zero,
  author        = {Cauderan, Alvaro and Boeshertz, Gauthier and Schwarb, Florian and Zhang, Chen Bo Calvin},
  title         = {Zero-Shot Transfer in Imitation Learning},
  journal       = {arXiv preprint},
  eprint        = {2310.06710},
  archiveprefix = {arXiv},
  year          = {2023},
}
ICML
HIP-RL: Hallucinated Inputs for Preference-based Reinforcement Learning in Continuous Domains
Chen Bo Calvin Zhang and Giorgia Ramponi
In ICML 2023 Workshop: The Many Facets of Preference-Based Learning, 2023
% Workshop paper. The method acronym in the title is braced so that
% sentence-casing styles do not lowercase it.
@inproceedings{zhang2023hip,
  author    = {Zhang, Chen Bo Calvin and Ramponi, Giorgia},
  title     = {{HIP-RL}: Hallucinated Inputs for Preference-based Reinforcement Learning in Continuous Domains},
  booktitle = {ICML 2023 Workshop: The Many Facets of Preference-Based Learning},
  year      = {2023},
}