You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

launcher.py 3.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. # -*- coding: utf-8 -*-
  2. # MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  3. #
  4. # Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  5. #
  6. # Unless required by applicable law or agreed to in writing,
  7. # software distributed under the License is distributed on an
  8. # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. import functools
  10. import multiprocessing as mp
  11. from ..core._imperative_rt.core2 import sync
  12. from .group import group_barrier, init_process_group
  13. from .helper import get_device_count_by_fork
  14. from .server import Server
  15. from .util import get_free_ports
def _run_wrapped(
    func, is_multimachine, master_ip, port, world_size, rank, dev, args, kwargs
):
    """Init distributed process group and run wrapped function.

    Runs inside a child process spawned by :class:`launcher`.

    :param func: user function to execute after the group is set up.
    :param is_multimachine: True when world_size exceeds the local gpu count;
        enables the inter-machine barriers below.
    :param master_ip: ip address of the rank-0 node.
    :param port: port of the distributed server.
    :param world_size: total number of participating devices.
    :param rank: global rank assigned to this process.
    :param dev: local device index bound to this process.
    :param args: positional arguments forwarded to ``func``.
    :param kwargs: keyword arguments forwarded to ``func``.
    """
    init_process_group(
        master_ip=master_ip, port=port, world_size=world_size, rank=rank, device=dev
    )
    # wait until every machine has joined the group before running user code
    if is_multimachine:
        group_barrier()
    func(*args, **kwargs)
    # flush pending async work before declaring this rank finished
    sync()
    # keep machines in lockstep on teardown as well
    if is_multimachine:
        group_barrier()
  29. class launcher:
  30. """Decorator for launching multiple processes in single-machine multi-gpu training.
  31. :param func: the function you want to launch in distributed mode.
  32. :param n_gpus: how many devices each node.
  33. :param world_size: how many devices totally.
  34. :param rank_start: start number for rank.
  35. :param master_ip: ip address for master node (where the rank 0 is).
  36. :param port: server port for distributed server.
  37. """
  38. def __new__(cls, *args, **kwargs):
  39. if not args:
  40. return functools.partial(cls, **kwargs)
  41. return super().__new__(cls)
  42. def __init__(
  43. self,
  44. func,
  45. n_gpus=None,
  46. world_size=None,
  47. rank_start=0,
  48. master_ip="localhost",
  49. port=0,
  50. ):
  51. self.func = func
  52. self.n_gpus = n_gpus if n_gpus is not None else get_device_count_by_fork("gpu")
  53. self.world_size = world_size if world_size is not None else self.n_gpus
  54. self.rank_start = rank_start
  55. self.master_ip = master_ip
  56. self.port = port
  57. # master node create server
  58. if self.rank_start == 0:
  59. self.server = Server(self.port)
  60. self.port = self.server.py_server_port
  61. else:
  62. assert self.port != 0, "you have to assign a port for distributed server"
  63. def __call__(self, *args, **kwargs):
  64. procs = []
  65. for dev in range(self.n_gpus):
  66. p = mp.Process(
  67. target=_run_wrapped,
  68. args=(
  69. self.func,
  70. self.world_size > self.n_gpus,
  71. self.master_ip,
  72. self.port,
  73. self.world_size,
  74. dev + self.rank_start,
  75. dev,
  76. args,
  77. kwargs,
  78. ),
  79. )
  80. p.start()
  81. procs.append(p)
  82. devs = list(range(self.n_gpus))
  83. while len(devs) > 0:
  84. left = []
  85. # check all processes in one second
  86. time_to_wait = 1.0 / len(devs)
  87. for dev in devs:
  88. procs[dev].join(time_to_wait)
  89. code = procs[dev].exitcode
  90. # terminate processes if one of them has failed
  91. if code != 0 and code != None:
  92. for i in devs:
  93. procs[i].terminate()
  94. assert (
  95. code == 0 or code == None
  96. ), "subprocess {} exit with code {}".format(dev + self.rank_start, code)
  97. if code == None:
  98. left.append(dev)
  99. devs = left

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台