Node.js 最佳实践异常处理

几天前我才开始尝试使用 node.js。我意识到只要程序中有未处理的异常，Node 就会终止。这与我所见过的普通服务器容器不同，在普通服务器容器中，当发生未处理的异常时，只有工作线程死亡，并且容器仍能够接收请求。这引起了一些问题：

是process.on('uncaughtException')防范的唯一有效方法吗？
process.on('uncaughtException')也会在异步进程执行期间捕获未处理的异常吗？
是否存在已构建的模块（例如发送电子邮件或写入文件），在未捕获的异常的情况下可以利用该模块？

我将不胜感激任何向我展示在 node.js 中处理未捕获异常的常见最佳实践的指针 / 文章

node.js exception-handling serverside-javascript

答案

Update: Joyent now has their own guide. The following information is more of a summary:

Safely "throwing" errors

Ideally we'd like to avoid uncaught errors as much as possible. As such, instead of literally throwing the error, we can safely "throw" it using one of the following methods, depending on our code architecture:

For synchronous code, if an error happens, return the error:

// Synchronous divider: failure is reported by returning an Error, never by throwing
var divideSync = function(x,y) {
    // guard clause: a zero divisor is the error condition
    if ( y === 0 ) {
        // "throw" safely - the caller receives the Error as the return value
        return new Error("Can't divide by zero");
    }

    // no error occurred, hand back the quotient
    return x / y;
};

// Divide 4/2 and inspect the return value to find out whether it is an Error
var result = divideSync(4, 2);
if ( !(result instanceof Error) ) {
    // no error occurred, continue on
    console.log('4/2=' + result);
}
else {
    // the Error came back as the return value; handle it safely
    console.log('4/2=err', result);
}

// Divide 4/0 and inspect the return value the same way
result = divideSync(4, 0);
if ( !(result instanceof Error) ) {
    // no error occurred, continue on
    console.log('4/0=' + result);
}
else {
    // the Error came back as the return value; handle it safely
    console.log('4/0=err', result);
}

For callback-based (i.e. asynchronous) code, the first argument of the callback is err: if an error happens, err is the error; if no error happens, err is null. Any other arguments follow the err argument:

// Callback-style divider: calls next(err) on failure and next(null, quotient) on success
var divide = function(x,y,next) {
    // guard clause: a zero divisor is the error condition
    if ( y === 0 ) {
        // "throw" safely by passing the Error as the callback's first argument
        next(new Error("Can't divide by zero"));
        return;
    }

    // no error occurred: err is null and the quotient follows it
    next(null, x / y);
};

// Divide 4/2: the completion callback receives err first, then the result
divide(4, 2, function(err, result){
    if ( !err ) {
        // no error occurred, continue on
        console.log('4/2=' + result);
        return;
    }
    // handle the error safely
    console.log('4/2=err', err);
});

// Divide 4/0: the same callback shape, this time err is set
divide(4, 0, function(err, result){
    if ( !err ) {
        // no error occurred, continue on
        console.log('4/0=' + result);
        return;
    }
    // handle the error safely
    console.log('4/0=err', err);
});

For eventful code, where the error may happen anywhere, instead of throwing the error, fire the error event instead:

// Define our Divider Event Emitter
var events = require('events')
/**
 * Divider is an EventEmitter that reports division outcomes through events
 * instead of return values or thrown exceptions.
 *
 * Events:
 *   'divided' (x, y, quotient) - emitted when the division succeeds
 *   'error'   (err)            - emitted when y is 0
 *
 * Declared with class/extends rather than util.inherits, which the Node.js
 * documentation discourages in favour of ES6 classes.
 */
class Divider extends events.EventEmitter {
    /**
     * Divide x by y and emit the outcome.
     *
     * @param {number} x - dividend
     * @param {number} y - divisor
     * @returns {Divider} this instance, so calls can be chained
     */
    divide(x, y) {
        // if error condition?
        if ( y === 0 ) {
            // "throw" the error safely by emitting it
            this.emit('error', new Error("Can't divide by zero"));
        }
        else {
            // no error occurred, continue on
            this.emit('divided', x, y, x / y);
        }

        // Chain
        return this;
    }
}

// Create the divider and subscribe to both outcomes before dividing
var divider = new Divider();

divider
    .on('error', function(err){
        // handle the error safely
        console.log(err);
    })
    .on('divided', function(x, y, result){
        console.log(x + '/' + y + '=' + result);
    });

// Divide: the first call emits 'divided', the second emits 'error'
divider.divide(4, 2);
divider.divide(4, 0);

Safely "catching" errors

Sometimes though, there may still be code that throws an error somewhere which can lead to an uncaught exception and a potential crash of our application if we don't catch it safely. Depending on our code architecture we can use one of the following methods to catch it:

When we know where the error is occurring, we can wrap that section in a node.js domain:

// Create a domain; its 'error' listener receives whatever is thrown inside run()
var d = require('domain').create();
d.on('error', function onDomainError(err){
    // handle the error safely
    console.log(err);
});

// run() covers both the synchronous and the asynchronous code started inside it
d.run(function guarded(){
    // stand-in for the code whose thrown errors we want to catch
    throw new Error('example');
});

If we know that the code where the error occurs is synchronous, and for whatever reason we can't use domains (perhaps an old version of node), we can use the try...catch statement:

// try...catch only intercepts errors thrown synchronously inside the try block
try {
    // stand-in for the synchronous code whose thrown errors we want to catch
    var err = new Error('example');
    throw err;
}
catch (caught) {
    // handle the error safely
    console.log(caught);
}

However, be careful not to use try...catch in asynchronous code, as an asynchronously thrown error will not be caught:

try {
    // the timer callback runs one second later, after this try block has already exited
    setTimeout(function throwLater(){
        throw new Error('example');
    }, 1000);
}
catch (err) {
    // never reached for the error above: it is thrown outside this try block
    // and crashes the app, which is why domains are needed
}

If you do want to work with try..catch in conjunction with asynchronous code, when running Node 7.4 or higher you can use async/await natively to write your asynchronous functions.

Another thing to be careful about with try...catch is the risk of wrapping your completion callback inside the try statement like so:

// Callback-style divider: the completion callback gets the Error, or null plus the quotient
var divide = function(x,y,next) {
    var divisorIsZero = ( y === 0 );

    if ( !divisorIsZero ) {
        // no error occurred, continue on
        next(null, x / y);
        return;
    }

    // "throw" the error safely: it becomes the first argument of the completion callback
    next(new Error("Can't divide by zero"));
};

// Completion callback that always fails, standing in for unrelated downstream code
var continueElsewhere = function(err, result){
    var failure = new Error('elsewhere has failed');
    throw failure;
};

try {
    // divide invokes continueElsewhere synchronously, so the callback's
    // execution also happens inside this try statement
    divide(4, 2, continueElsewhere);
}
catch (caught) {
    // outputs the "unexpected" result: elsewhere has failed
    console.log(caught.stack);
}

This gotcha is very easy to do as your code becomes more complex. As such, it is best to either use domains or to return errors to avoid (1) uncaught exceptions in asynchronous code (2) the try catch catching execution that you don't want it to. In languages that allow for proper threading instead of JavaScript's asynchronous event-machine style, this is less of an issue.

Finally, in the case where an uncaught error happens in a place that wasn't wrapped in a domain or a try catch statement, we can make our application not crash by using the uncaughtException listener (however doing so can put the application in an unknown state):

// Last-resort listener for errors that no domain or try...catch statement intercepted.
// Bind it only in applications, never in modules, otherwise several of these could end up bound.
process.on('uncaughtException', function onUncaught(err) {
    // handle the error safely
    console.log(err);
});

// stand-in for the asynchronous or synchronous code that raises the otherwise uncaught error
var err = new Error('example');
throw err;

以下是该主题的许多不同来源的摘要和精选，包括代码示例和某些博客文章的引文。最佳做法的完整列表可以在这里找到

Node.JS 错误处理的最佳实践

编号 1：使用 Promise 进行异步错误处理

TL; DR：以回调方式处理异步错误可能是通向地狱（也称为 “末日金字塔”）的最快方法。您能送给代码的最好礼物，就是改用信誉良好的 Promise 库，它提供了非常紧凑且令人熟悉的代码语法，类似 try-catch

否则：Node.JS 回调风格 function(err, response) 会把错误处理与普通业务代码混在一起，再加上过多的嵌套和笨拙的编码模式，很容易写出无法维护的代码

代码示例 - 好

// Promise chain: each .then step runs only after the previous step fulfills.
// A rejection (or a throw) in any step before .catch skips the remaining
// .then steps and is delivered to errorHandler; the final .then(verify)
// runs once the chain is settled again after .catch.
// NOTE(review): doWork, doError, errorHandler and verify are defined elsewhere;
// doError presumably rejects to demonstrate the error path - confirm.
doWork()
.then(doWork)
.then(doError)
.then(doWork)
.catch(errorHandler)
.then(verify);

代码示例反模式–回调样式错误处理

// Anti-pattern: error-first callbacks nested inside each other ("pyramid of doom").
// Every level has to repeat its own error check, and the nesting grows with each step.
getData(someParameter, function(err, result){
    if(err != null) {
        //do something like calling the given callback function and pass the error
    }
    getMoreData(a, function(err, result){
        if(err != null) {
            //do something like calling the given callback function and pass the error
        }
        getMoreData(b, function(c){
            getMoreData(d, function(e){
                // ...
            });
        });
    });
});

博客语录：“我们在 Promise 方面有问题”（We have a problem with promises） （来自博客 pouchdb，关键字 “Node Promises” 排名 11）

“…… 事实上，回调的作用更加险恶：它们剥夺了我们的堆栈，而堆栈在编程语言中通常被我们视为理所当然。编写没有堆栈的代码就像驾驶一辆没有刹车的汽车：直到您伸脚去踩却发现它不在那里，才会意识到自己有多么需要它。 Promise 的全部目的是让我们找回异步时丢失的语言基础：return、throw 和堆栈。但必须知道如何正确使用 Promise 才能利用它们。 ”

2 号：仅使用内置的 Error 对象

TL; DR：看到将错误作为字符串或自定义类型抛出的代码非常普遍–这使错误处理逻辑和模块之间的互操作性变得复杂。无论您拒绝承诺，引发异常还是发出错误 - 使用 Node.JS 内置的 Error 对象，可以提高一致性并防止丢失错误信息

否则：在执行某个模块时，由于不确定返回哪种类型的错误，因此很难对即将到来的异常进行推理和处理。更糟的是，使用自定义类型描述错误可能会导致丢失关键错误信息，例如堆栈跟踪！

代码示例 - 正确执行

//throwing an Error from a typical function, whether sync or async
if (!productToAdd) {
    throw new Error("How can I add new product when no value provided?");
}

//'throwing' an Error from an EventEmitter by emitting the 'error' event
const myEmitter = new MyEmitter();
myEmitter.emit('error', new Error('whoops!'));

//'throwing' an Error from a Promise
 return new promise(function (resolve, reject) {
 DAL.getProduct(productToAdd.id).then((existingProduct) =>{
 if(existingProduct != null)
 return reject(new Error("Why fooling us and trying to add an existing product?"));

代码示例反模式

//anti-pattern: a thrown String carries no stack trace and none of the other Error properties
if (!productToAdd) {
    throw "How can I add new product when no value provided?";
}

博客引用：“字符串不是错误” （来自博客 devthought，关键字 “Node.JS error object” 的排名为 6）

“… 传递字符串而不是错误会导致模块之间的互操作性降低。它破坏了与可能正在执行错误检查实例或想要了解更多有关错误的 API 的契约 。正如我们将看到的，错误对象具有除了保留传递给构造函数的消息外，现代 JavaScript 引擎中的有趣属性。”

3 号：区分操作错误与程序员错误

TL; DR：操作错误（例如，API 接收到无效输入）是指可以充分理解并可以深思熟虑地处理错误影响的已知情况。另一方面，程序员错误（例如，尝试读取未定义的变量）是指未知的代码错误，这些错误指示必须正常重启应用程序

否则：您可能总是在出现错误时重新启动应用程序，但是为什么由于次要和预期的错误（操作错误）而导致约 5000 个在线用户失望？相反也不是理想的选择 - 在发生未知问题（程序员错误）时保持应用程序正常运行可能会导致意外行为。区分两者允许根据给定的上下文采取行动并采取平衡的方法

代码示例 - 正确执行

//throwing an Error from a typical function, whether sync or async
if (!productToAdd) {
    throw new Error("How can I add new product when no value provided?");
}

//'throwing' an Error from an EventEmitter by emitting the 'error' event
const myEmitter = new MyEmitter();
myEmitter.emit('error', new Error('whoops!'));

//'throwing' an Error from a Promise
 return new promise(function (resolve, reject) {
 DAL.getProduct(productToAdd.id).then((existingProduct) =>{
 if(existingProduct != null)
 return reject(new Error("Why fooling us and trying to add an existing product?"));

代码示例 - 将错误标记为可操作（受信任）

//build the Error and flag it as operational in a single expression
var myError = Object.assign(
    new Error("How can I add new product when no value provided?"),
    { isOperational: true }
);

//or if you're using some centralized error factory (see other examples at the bullet "Use only the built-in Error object")
/**
 * Application error type that carries a classification and an operational flag.
 *
 * Instances inherit from Error, so `instanceof Error` checks and code that
 * reads `message` / `stack` keep working. Calling Error.call(this) does not
 * initialise `this`, so name and message are assigned explicitly.
 *
 * @param {*} commonType - error category chosen by the caller
 * @param {string} description - human-readable explanation; also used as `message`
 * @param {boolean} isOperational - true for known, expected (trusted) errors
 */
function appError(commonType, description, isOperational) {
    this.name = 'appError';
    this.message = description;
    this.commonType = commonType;
    this.description = description;
    this.isOperational = isOperational;
    // captured after name/message are set; passing appError leaves the
    // constructor's own frame out of the trace
    Error.captureStackTrace(this, appError);
}
appError.prototype = Object.create(Error.prototype, {
    constructor: { value: appError, writable: true, configurable: true }
});

//usage: raise an error through the factory with isOperational set to true
//NOTE(review): errorManagement.commonErrors is not defined in this snippet - presumably a project-level catalogue of error types; confirm
throw new appError(errorManagement.commonErrors.InvalidInput, "Describe here what happened", true);

//process-level handler: errors flagged as operational are tolerated, anything else ends the process
process.on('uncaughtException', function(error) {
    if (error.isOperational) {
        return;
    }
    process.exit(1);
});

博客语录：“否则，您就要冒状态的风险”（可调试的博客中，关键字 “Node.JS 未捕获的异常” 排名 3）

“ … 从本质上讲，throw 在 JavaScript 中是如何工作的，几乎没有任何方法可以安全地 “从中断的地方开始”，而不会泄漏引用或创建其他未定义的易碎状态。响应的最安全方法抛出的错误是关闭的过程。当然，在一个正常的 Web 服务器，你可能有很多连接打开，因为错误是由其他人触发时，它是不合理的突然关闭这些了。更好的方法是向触发错误的请求发送错误响应，同时让其他人在正常时间内完成操作，并停止监听该工作程序中的新请求”

第四条：集中处理错误，经由中间件转发，但不要在中间件内部处理

TL; DR：错误处理逻辑（例如发给管理员的邮件和日志记录）应封装在一个专用的集中对象中，当出现错误时，所有端点（例如 Express 中间件，cron 作业，单元测试）都将调用该对象。

否则：不在一个地方处理错误将导致代码重复，并可能导致某些错误得不到正确处理

代码示例 - 典型错误流

//DAL layer, we don't handle errors here
//The error-first callback converts a failed insert into a thrown Error.
//"other useful parameters" is placeholder pseudo-code for extra context, not valid JavaScript.
//NOTE(review): this throw happens inside a callback; if DB.addDocument invokes it
//asynchronously, a try/catch around the call site will not see it - confirm the intent.
DB.addDocument(newCustomer, (error, result) => {
    if (error)
        throw new Error("Great error explanation comes here", other useful parameters)
});

//API route code: a synchronous throw from addNew and a rejected promise both end up in next()
try {
    customerService
        .addNew(req.body)
        .then((result) => {
            res.status(200).json(result);
        })
        .catch((error) => {
            // asynchronous failure: forward to the error-handling middleware
            next(error);
        });
}
catch (error) {
    // synchronous failure: forward to the error-handling middleware
    next(error);
}

//Error handling middleware, we delegate the handling to the centralized error handler
//handleError resolves with a flag saying whether the error was operational;
//anything that is not operational is passed on to the next error handler.
app.use(function (err, req, res, next) {
    errorHandler.handleError(err)
        .then((isOperationalError) => {
            if (!isOperationalError)
                next(err);
        })
        // a failure inside the error handler itself is forwarded instead of
        // becoming an unhandled rejection
        .catch(next);
});

博客引用： “有时较低的级别除了将错误传播给调用者之外，无济于事”（在博客 Joyent 中，关键字 “Node.JS 错误处理” 排名 1）

“…… 您可能最终会在堆栈的多个级别上处理相同的错误。当较低的级别除了将错误传播给其调用者，将错误传播给其调用者等无法执行任何其他操作时，就会发生这种情况。通常，只有顶层调用者知道什么是适当的响应，无论是重试操作，向用户报告错误还是其他，但这并不意味着您应该尝试将所有错误报告给单个顶层回调，因为该回调本身无法知道在什么情况下发生了错误”

5：使用 Swagger 为 API 错误编写文档

TL; DR：让您的 API 调用者知道哪些错误可能会返回，以便他们可以认真处理这些错误而不会崩溃。这通常是通过 REST API 文档框架（例如 Swagger）完成的

否则： API 客户端可能决定崩溃并重新启动，仅是因为他收到了无法理解的错误。注意：API 的调用者可能是您（在微服务环境中非常典型）

博客引用： “您必须告诉调用者可能发生什么错误”（在 Joyent 博客中，关键字 “Node.JS logging” 的排名为 1）

… 我们已经讨论了如何处理错误，但是当您编写新函数时，如何将错误传递给调用函数的代码？ … 如果您不知道会发生什么错误或不知道错误的含义，那么您的程序除非是偶然的，否则是不正确的。因此，如果您要编写新函数，则必须告诉调用者可能发生的错误以及错误的含义。

6 号：当一个陌生人来到小镇时，优雅地关闭该过程

TL; DR：当发生未知错误（开发人员错误，请参阅最佳实践编号 3）时 - 应用程序的健康状况不确定。通常的做法是建议使用 Forever 和 PM2 等 “重新启动器” 工具仔细重新启动该过程

否则：当捕获到一个陌生的异常时，某些对象可能处于故障状态（例如，全局使用的事件发射器，并且由于某些内部故障而不再触发事件），并且所有将来的请求都可能失败或疯狂

代码示例 - 确定是否崩溃

//decide whether to crash when an uncaught exception arrives
//assumes developers mark known operational errors with error.isOperational=true, read best practice #3
process.on('uncaughtException', function(error) {
    // always hand the error to the centralized handler first
    errorManagement.handler.handleError(error);

    // trusted (operational) errors leave the process running
    if (errorManagement.handler.isTrustedError(error)) {
        return;
    }
    process.exit(1);
});


//centralized error handler encapsulates error-handling related logic
function errorHandler(){
    /**
     * Logs the error, then runs the follow-up steps in order.
     *
     * @param {Error} error - the error to process
     * @returns {Promise} settles with whatever determineIfOperationalError yields
     */
    this.handleError = function (error) {
        // pass the `error` parameter itself: the original read an undefined `err`
        return logger.logError(error)
            .then(sendMailToAdminIfCritical)
            .then(saveInOpsQueueIfCritical)
            .then(determineIfOperationalError);
    };

    /**
     * @param {Error} error - the error to inspect
     * @returns {*} the error's isOperational flag
     */
    this.isTrustedError = function (error) {
        return error.isOperational;
    };
}

博客语录： “关于错误处理的三种思路”（来自博客 jsrecipes）

… 关于错误处理，主要有三种思路：1. 让应用程序崩溃并重新启动它。 2. 处理所有可能的错误，永不崩溃。 3. 两者之间的平衡方法

Number7：使用成熟的记录器来提高错误可见性

TL; DR：一组成熟的日志记录工具，例如 Winston，Bunyan 或 Log4J，将加快错误发现和理解的速度。因此，请忘记 console.log。

否则：浏览 console.logs 或手动浏览混乱的文本文件而不使用查询工具或不错的日志查看器，可能会使您忙于工作直到很晚

代码示例 - 运行中的 Winston 记录器

//your centralized logger object: 'info' level, writing to the console and to somefile.log
var logger = new winston.Logger({
    level: 'info',
    transports: [
        new winston.transports.Console(),
        new winston.transports.File({ filename: 'somefile.log' })
    ]
});

//custom code somewhere using the logger: level, format string, format argument, metadata object
logger.log(
    'info',
    'Test Log Message with some parameter %s',
    'some parameter',
    { anything: 'This is metadata' }
);

博客引用： “让我们确定一些要求（对于记录器）：”（来自博客 strongblog）

… 让我们确定一些要求（对于记录器）：1. 在每条日志行上打上时间戳。这很容易解释 - 您应该能够分辨出何时发生每个日志条目。 2. 记录格式应易于人类和机器消化。 3. 允许多个可配置的目标流。例如，您可能将跟踪日志写入一个文件，但是遇到错误时，先写入同一文件，然后再写入错误文件并同时发送电子邮件...

第八条：使用 APM 产品发现错误和停机时间

TL; DR：监视和性能产品（aka APM）主动评估您的代码库或 API，以便它们可以自动突出显示您所缺少的错误，崩溃和缓慢的部分

否则：您可能会花费大量精力来衡量 API 性能和停机时间，也许您永远不会知道在现实情况下哪些是您最慢的代码部分，以及它们如何影响 UX

博客引用： “APM 产品细分”（来自博客 Yoni Goldberg）

“……APM 产品包括 3 个主要部分： 1. 网站或 API 监视–可以通过 HTTP 请求持续监视正常运行时间和性能的外部服务。可以在几分钟内设置。以下是一些竞争者：Pingdom，正常运行时间机器人和 New Relic 2 。代码工具–需要在应用程序中嵌入代理才能受益的产品系列，其特点是缓慢的代码检测，异常统计信息，性能监视等，以下是一些选定的竞争者：New Relic，App Dynamics 3. 运营情报仪表板–这些产品线的产品侧重于为操作团队提供指标和精选内容，以帮助轻松掌握应用程序性能，这通常涉及汇总多种信息源（应用程序日志，数据库日志，服务器日志等）和前期仪表板设计工作。以下是一些竞争者：Datadog，Splunk”

上面是简化版本 - 请在此处查看更多最佳做法和示例

您可以捕获未捕获的异常，但用途有限。参见http://debuggable.com/posts/node-js-dealing-with-uncaught-exceptions:4c933d54-1428-443c-928d-4e1ecbdd56cb

monit ， forever或upstart可以用于在崩溃时重新启动节点进程。正常关机是您最好的选择（例如，将所有内存数据保存在未捕获的异常处理程序中）。