Add timeout injection to faulty agent for testing (pytorch#37485)

rohan-varma · facebook-github-bot · commit d6394183078d · 2020-05-01T23:48:28.000-07:00
Summary: Pull Request resolved: pytorch#37485 Adds arbitrary delay (timeout) injection to the faulty RPC agent. This makes it possible to better test scenarios that depend on long-running RPCs, such as properly testing RPC timeouts and the profiler in all scenarios. This is done by overriding ProcessGroupAgent's `enqueueSend()` function to inject the delay. Which messages to delay is determined similarly to the existing `faulty_messages`: the user specifies a mapping from message type to delay in seconds. Added unit tests that verify RPC timeouts work with builtin + TorchScript functions, which was not tested before. ghstack-source-id: 103341662 Test Plan: Added unit tests in `FaultyRpcAgentTest`. Differential Revision: D21296537 fbshipit-source-id: 1dbc21aee14e49780272634e9cbb2b5a448f2896
diff --git a/torch/csrc/distributed/rpc/process_group_agent.h b/torch/csrc/distributed/rpc/process_group_agent.h
@@ -91,6 +91,9 @@ class ProcessGroupAgent : public RpcAgent {
       Message&& message,
       const float rpcTimeoutSeconds = kUnsetRpcTimeout) override;
 
+  // put SendWork into a queue and notify the worker thread
+  virtual void enqueueSend(SendWork work);
+
  private:
   using steady_clock_time_point =
       std::chrono::time_point<std::chrono::steady_clock>;
@@ -145,8 +148,6 @@ class ProcessGroupAgent : public RpcAgent {
   };
 
   void collectNames();
-  // put SendWork into a queue and notify the worker thread
-  void enqueueSend(SendWork work);
   // handle a SendWork request. This serializes the payload inside the work
   // object, and sends the message to the receiver using the underlying
   // ProcessGroup.
diff --git a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.cpp
@@ -4,6 +4,10 @@ namespace torch {
 namespace distributed {
 namespace rpc {
 
+namespace {
+constexpr auto kSecToMsConversion = 1000;
+}
+
 std::string fromVec(const std::vector<char>& vec) {
   return std::string(vec.begin(), vec.end());
 }
@@ -14,14 +18,16 @@ FaultyProcessGroupAgent::FaultyProcessGroupAgent(
     int numSendRecvThreads,
     std::chrono::milliseconds rpcTimeout,
     const std::vector<std::string>& messagesToFail,
+    const std::unordered_map<std::string, float>& messageTypesToDelay,
     int failNumSends)
     : ProcessGroupAgent(
           std::move(workerName),
           std::move(pg),
           numSendRecvThreads,
           rpcTimeout),
       failNumSends_(failNumSends),
-      messageTypesToFail_(parseMessagesToFailInput(messagesToFail)) {}
+      messageTypesToFail_(parseMessagesToFailInput(messagesToFail)),
+      messageTypesToDelay_(parseMessagesToDelay(messageTypesToDelay)) {}
 
 std::vector<MessageType> FaultyProcessGroupAgent::parseMessagesToFailInput(
     const std::vector<std::string>& messagesToFail) const {
@@ -30,21 +36,27 @@ std::vector<MessageType> FaultyProcessGroupAgent::parseMessagesToFailInput(
   // types. We will then check this list of types in the send function to
   // determine whether we should fail or not.
   std::vector<MessageType> messageTypesToFail;
+  messageTypesToFail.reserve(messagesToFail.size());
   for (const auto& msgString : messagesToFail) {
-    if (msgString == "RREF_FORK_REQUEST") {
-      messageTypesToFail.emplace_back(MessageType::RREF_FORK_REQUEST);
-    } else if (msgString == "RREF_CHILD_ACCEPT") {
-      messageTypesToFail.emplace_back(MessageType::RREF_CHILD_ACCEPT);
-    } else if (msgString == "RREF_USER_DELETE") {
-      messageTypesToFail.emplace_back(MessageType::RREF_USER_DELETE);
-    } else if (msgString == "CLEANUP_AUTOGRAD_CONTEXT_REQ") {
-      messageTypesToFail.emplace_back(
-          MessageType::CLEANUP_AUTOGRAD_CONTEXT_REQ);
-    }
+    messageTypesToFail.push_back(messageStringToType(msgString));
   }
   return messageTypesToFail;
 }
 
+std::unordered_map<MessageType, float, std::hash<int>> FaultyProcessGroupAgent::
+    parseMessagesToDelay(const std::unordered_map<std::string, float>&
+                             messageTypesToDelay) const {
+  std::unordered_map<MessageType, float, std::hash<int>> delayMessages;
+  for (const auto& messagePair : messageTypesToDelay) {
+    float delay = messagePair.second;
+    TORCH_CHECK(
+        delay >= 0,
+        "Delays passed to FaultyProcessGroupAgent must be non-negative.");
+    delayMessages.insert({messageStringToType(messagePair.first), delay});
+  }
+  return delayMessages;
+}
+
 std::shared_ptr<FutureMessage> FaultyProcessGroupAgent::send(
     const WorkerInfo& to,
     Message&& message,
@@ -76,13 +88,49 @@ std::shared_ptr<FutureMessage> FaultyProcessGroupAgent::send(
   }
 }
 
+void FaultyProcessGroupAgent::enqueueSend(SendWork work) {
+  float msgDelay = getDelayForMessage(work.message_.type());
+  if (msgDelay != 0) {
+    // Sleep for the specified delay for the message.
+    std::this_thread::sleep_for(std::chrono::milliseconds(
+        static_cast<int>(msgDelay * kSecToMsConversion)));
+  }
+  ProcessGroupAgent::enqueueSend(std::move(work));
+}
+
 bool FaultyProcessGroupAgent::shouldFailMessage(MessageType type) const {
   // Return true if the input message type is in the messageTypesToFail_ list
   return (
       std::find(messageTypesToFail_.begin(), messageTypesToFail_.end(), type) !=
       messageTypesToFail_.end());
 }
 
+float FaultyProcessGroupAgent::getDelayForMessage(MessageType type) const {
+  const auto& it = messageTypesToDelay_.find(type);
+  return it == messageTypesToDelay_.end() ? 0 : it->second;
+}
+
+MessageType FaultyProcessGroupAgent::messageStringToType(
+    const std::string& messageString) const {
+  // Lazily constructed, immutable lookup from message name to MessageType.
+  static const std::unordered_map<std::string, MessageType> msgMap = {
+      {"RREF_FORK_REQUEST", MessageType::RREF_FORK_REQUEST},
+      {"RREF_CHILD_ACCEPT", MessageType::RREF_CHILD_ACCEPT},
+      {"RREF_USER_DELETE", MessageType::RREF_USER_DELETE},
+      {"CLEANUP_AUTOGRAD_CONTEXT_REQ",
+       MessageType::CLEANUP_AUTOGRAD_CONTEXT_REQ},
+      {"PYTHON_REMOTE_CALL", MessageType::PYTHON_REMOTE_CALL},
+      {"PYTHON_CALL", MessageType::PYTHON_CALL},
+      {"SCRIPT_CALL", MessageType::SCRIPT_CALL},
+  };
+  const auto it = msgMap.find(messageString);
+  TORCH_CHECK(
+      it != msgMap.end(),
+      "No mapping to rpc::MessageType exists for ",
+      messageString);
+  return it->second;
+}
+
 } // namespace rpc
 } // namespace distributed
 } // namespace torch
diff --git a/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.h b/torch/csrc/distributed/rpc/testing/faulty_process_group_agent.h
@@ -14,17 +14,20 @@ struct FaultyProcessGroupRpcBackendOptions
       float rpc_timeout,
       std::string init_method,
       std::vector<std::string> messages_to_fail,
+      std::unordered_map<std::string, float> messages_to_delay,
       int num_fail_sends = 0)
       : ProcessGroupRpcBackendOptions(
             num_send_recv_threads,
             rpc_timeout,
             std::move(init_method)),
         messagesToFail(std::move(messages_to_fail)),
+        messagesToDelay(std::move(messages_to_delay)),
         numFailSends(num_fail_sends) {
     TORCH_CHECK(numFailSends >= 0, "numFailSends should be non-negative");
   }
 
   std::vector<std::string> messagesToFail;
+  std::unordered_map<std::string, float> messagesToDelay;
   int numFailSends;
 };
 
@@ -36,6 +39,7 @@ class FaultyProcessGroupAgent : public ProcessGroupAgent {
       int numSendRecvThreads,
       std::chrono::milliseconds rpcTimeout,
       const std::vector<std::string>& messagesToFail,
+      const std::unordered_map<std::string, float>& messageTypesToDelay,
       int failNumSends = 0);
 
   // Faulty send function for this class.
@@ -45,6 +49,9 @@ class FaultyProcessGroupAgent : public ProcessGroupAgent {
       const float rpcTimeoutSeconds =
           torch::distributed::rpc::kUnsetRpcTimeout) override;
 
+  // Overrides ProcessGroupAgent's enqueueSend to inject delays.
+  void enqueueSend(SendWork work) override;
+
  protected:
   // This function checks the messageTypesToFail_ to determine whether to use
   // the faulty send or not.
@@ -56,18 +63,32 @@ class FaultyProcessGroupAgent : public ProcessGroupAgent {
   std::vector<MessageType> parseMessagesToFailInput(
       const std::vector<std::string>& messagesToFail) const;
 
+  // Returns amount of time in seconds to delay sending of the given message
+  // type.
+  float getDelayForMessage(MessageType type) const;
+
+  // Parse message types that we should inject arbitrary delays for.
+  std::unordered_map<MessageType, float, std::hash<int>> parseMessagesToDelay(
+      const std::unordered_map<std::string, float>& messageTypesToDelay) const;
+
   // Number of sends to intentionally fail before allowing one to succeed.
   const int failNumSends_;
 
   // Vector of the MessageTypes that we must use the faulty send for. This is
   // parsed based on a list of strings passed in by the python tests.
   const std::vector<MessageType> messageTypesToFail_;
 
+  // Mapping of message types to the amount of time, in seconds, by which
+  // enqueueSend() should delay sending them.
+  std::unordered_map<MessageType, float, std::hash<int>> messageTypesToDelay_;
+
   // Map to track the number of sends we've failed for each RPC.
   std::unordered_map<std::string, int> failMessageCountMap_;
 
   // Mutex to guard failMessageCountMap_
   std::mutex failMapMutex_;
+
+  MessageType messageStringToType(const std::string& messageString) const;
 };
 } // namespace rpc
 } // namespace distributed
diff --git a/torch/csrc/distributed/rpc/testing/init.cpp b/torch/csrc/distributed/rpc/testing/init.cpp
@@ -36,18 +36,28 @@ PyObject* faulty_agent_init(PyObject* /* unused */) {
       "FaultyProcessGroupRpcBackendOptions",
       rpc_module.attr("ProcessGroupRpcBackendOptions"))
       .def(
-          py::init<int, float, std::string, std::vector<std::string>, int>(),
+          py::init<
+              int,
+              float,
+              std::string,
+              std::vector<std::string>,
+              std::unordered_map<std::string, float>,
+              int>(),
           py::arg("num_send_recv_threads"),
           py::arg("rpc_timeout"),
           py::arg("init_method"),
           py::arg("messages_to_fail"),
+          py::arg("messages_to_delay"),
           py::arg("num_fail_sends"))
       .def_readwrite(
           "num_send_recv_threads",
           &ProcessGroupRpcBackendOptions::numSendRecvThreads)
       .def_readwrite(
           "messages_to_fail",
           &FaultyProcessGroupRpcBackendOptions::messagesToFail)
+      .def_readwrite(
+          "messages_to_delay",
+          &FaultyProcessGroupRpcBackendOptions::messagesToDelay)
       .def_readwrite(
           "num_fail_sends", &FaultyProcessGroupRpcBackendOptions::numFailSends);
 
@@ -59,13 +69,15 @@ PyObject* faulty_agent_init(PyObject* /* unused */) {
               std::shared_ptr<::c10d::ProcessGroup>,
               int,
               std::chrono::milliseconds,
-              std::vector<std::string>,
+              const std::vector<std::string>&,
+              const std::unordered_map<std::string, float>&,
               int>(),
           py::arg("name"),
           py::arg("process_group"),
           py::arg("num_send_recv_threads"),
           py::arg("rpc_timeout"),
           py::arg("messages_to_fail"),
+          py::arg("messages_to_delay"),
           py::arg("failNumSends"))
       .def(
           "join",
diff --git a/torch/distributed/rpc/_testing/faulty_agent_backend_registry.py b/torch/distributed/rpc/_testing/faulty_agent_backend_registry.py
@@ -12,6 +12,7 @@ def _faulty_process_group_construct_rpc_backend_options_handler(
     init_method,
     num_send_recv_threads,
     messages_to_fail,
+    messages_to_delay,
     num_fail_sends,
     **kwargs
 ):
@@ -22,6 +23,7 @@ def _faulty_process_group_construct_rpc_backend_options_handler(
         init_method=init_method,
         num_send_recv_threads=num_send_recv_threads,
         messages_to_fail=messages_to_fail,
+        messages_to_delay=messages_to_delay,
         num_fail_sends=num_fail_sends,
     )
 
@@ -66,6 +68,7 @@ def _faulty_process_group_init_backend_handler(
             rpc_backend_options.num_send_recv_threads,
             timedelta(seconds=rpc_backend_options.rpc_timeout),
             rpc_backend_options.messages_to_fail,
+            rpc_backend_options.messages_to_delay,
             rpc_backend_options.num_fail_sends,
         )
     except Exception as ex:
diff --git a/torch/testing/_internal/dist_utils.py b/torch/testing/_internal/dist_utils.py
@@ -28,7 +28,7 @@ def __init__(self, *args, **kwargs):
 
 
 def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True,
-              faulty_messages=None):
+              faulty_messages=None, messages_to_delay=None):
     """
     We use this decorator for setting up and tearing down state since
     MultiProcessTestCase runs each `test*` method in a separate process and
@@ -54,6 +54,7 @@ def dist_init(old_test_method=None, setup_rpc=True, clean_shutdown=True,
             setup_rpc=setup_rpc,
             clean_shutdown=clean_shutdown,
             faulty_messages=faulty_messages,
+            messages_to_delay=messages_to_delay,
         )
 
     @wraps(old_test_method)
@@ -70,7 +71,7 @@ def new_test_method(self, *arg, **kwargs):
             and self.rpc_backend
             == rpc.backend_registry.BackendType.FAULTY_PROCESS_GROUP
         ):
-            _build_faulty_backend_options(self, faulty_messages)
+            _build_faulty_backend_options(self, faulty_messages, messages_to_delay)
 
         if setup_rpc:
             rpc.init_rpc(
@@ -100,7 +101,7 @@ def new_test_method(self, *arg, **kwargs):
     num_send_recv_threads=8,
 )
 
-def _build_faulty_backend_options(faulty_agent_fixture, faulty_messages):
+def _build_faulty_backend_options(faulty_agent_fixture, faulty_messages, messages_to_delay):
     '''
     Constructs the backend options object for the faulty process group agent
     based on the faulty_messages input to dist_init.
@@ -110,12 +111,18 @@ def _build_faulty_backend_options(faulty_agent_fixture, faulty_messages):
         if faulty_messages is not None
         else faulty_agent_fixture.retryable_message_types
     )
+    messages_to_delay = (
+        messages_to_delay
+        if messages_to_delay is not None
+        else faulty_agent_fixture.default_messages_to_delay
+    )
     TEST_CONFIG.build_rpc_backend_options = lambda test_object: rpc.backend_registry.construct_rpc_backend_options(
         test_object.rpc_backend,
         init_method=test_object.init_method,
         num_send_recv_threads=8,
         num_fail_sends=faulty_agent_fixture.num_fail_sends,
         messages_to_fail=messages_to_fail,
+        messages_to_delay=messages_to_delay,
     )
 
 
@@ -173,7 +180,7 @@ def get_timeout_error_regex(rpc_backend_name):
     should receive when an RPC has timed out. Useful for use with
     assertRaisesRegex() to ensure we have the right errors during timeout.
     """
-    if rpc_backend_name == "PROCESS_GROUP":
+    if rpc_backend_name in ["PROCESS_GROUP", "FAULTY_PROCESS_GROUP"]:
         return "RPC ran for more than"
     else:
         return "(Timed out)|(Task expired)"
diff --git a/torch/testing/_internal/distributed/rpc/faulty_rpc_agent_test_fixture.py b/torch/testing/_internal/distributed/rpc/faulty_rpc_agent_test_fixture.py
@@ -12,6 +12,13 @@
                            "RREF_USER_DELETE",
                            "CLEANUP_AUTOGRAD_CONTEXT_REQ"]
 
+# The following messages incur the corresponding delay in seconds while being
+# processed in FaultyProcessGroupAgent's enqueueSend() function.
+default_messages_to_delay = {
+    "PYTHON_CALL": 1.5,  # Python UDF
+    "SCRIPT_CALL": 1.5,  # Script/Builtin
+}
+
 class FaultyRpcAgentTestFixture(RpcAgentTestFixture):
     @property
     def rpc_backend(self):
@@ -26,3 +33,7 @@ def retryable_message_types(self):
     @property
     def num_fail_sends(self):
         return 3
+
+    @property
+    def default_messages_to_delay(self):
+        return default_messages_to_delay
diff --git a/torch/testing/_internal/distributed/rpc/rpc_test.py b/torch/testing/_internal/distributed/rpc/rpc_test.py