You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

distribute-train-cr.yaml 2.7 kB

4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  # DistributeTrain custom resource describing a 3-replica ResNet-50 training
  # job. Placeholders in double braces ({{IMAGE}}, {{NFS IP}}) must be
  # substituted before applying; they are quoted so the file stays valid YAML
  # (an unquoted leading "{" would be parsed as a flow mapping).
  apiVersion: onebrain.oneflow.org/v1alpha1
  kind: DistributeTrain
  metadata:
    name: dt-resnet50
    namespace: resnet50
    labels:
      key: value
  spec:
    # Replica count; NODE_NUM in `env` below carries the same value.
    size: 3
    image: "{{IMAGE}}"
    imagePullPolicy: IfNotPresent

    # Command for the master replica. Folded scalar (>-): the line breaks below
    # are joined with single spaces, producing one shell command line.
    # Steps: build NODE_IPS as a comma-separated list of the `.ip` fields in
    # /home/hostfile.json, remove core.* files and old snapshots, then launch
    # of_cnn_train_val.py.
    masterCmd: >-
      export NODE_IPS=`cat /home/hostfile.json |jq -r '.[]|.ip'|paste -d "," -s`
      && cd /workspace/Classification/cnns
      && rm -rf core.*
      && rm -rf ./output/snapshots/*
      && python3 of_cnn_train_val.py
      --train_data_dir=$DATA_ROOT/train
      --train_data_part_num=$TRAIN_DATA_PART_NUM
      --val_data_dir=$DATA_ROOT/validation
      --val_data_part_num=$VAL_DATA_PART_NUM
      --num_nodes=$NODE_NUM
      --node_ips="$NODE_IPS"
      --gpu_num_per_node=$GPU_NUM_PER_NODE
      --model_update="momentum"
      --learning_rate=0.256
      --loss_print_every_n_iter=1
      --batch_size_per_device=64
      --val_batch_size_per_device=64
      --num_epoch=1
      --model="resnet50"
      --model_save_dir=/model
    masterResources:
      requests:
        nvidia.com/gpu: 2
        memory: "16Gi"
        cpu: "2"
      limits:
        nvidia.com/gpu: 2
        memory: "16Gi"
        cpu: "2"

    # Command for the slave replicas; currently identical to masterCmd.
    slaveCmd: >-
      export NODE_IPS=`cat /home/hostfile.json |jq -r '.[]|.ip'|paste -d "," -s`
      && cd /workspace/Classification/cnns
      && rm -rf core.*
      && rm -rf ./output/snapshots/*
      && python3 of_cnn_train_val.py
      --train_data_dir=$DATA_ROOT/train
      --train_data_part_num=$TRAIN_DATA_PART_NUM
      --val_data_dir=$DATA_ROOT/validation
      --val_data_part_num=$VAL_DATA_PART_NUM
      --num_nodes=$NODE_NUM
      --node_ips="$NODE_IPS"
      --gpu_num_per_node=$GPU_NUM_PER_NODE
      --model_update="momentum"
      --learning_rate=0.256
      --loss_print_every_n_iter=1
      --batch_size_per_device=64
      --val_batch_size_per_device=64
      --num_epoch=1
      --model="resnet50"
      --model_save_dir=/model
    slaveResources:
      requests:
        nvidia.com/gpu: 2
        memory: "16Gi"
        cpu: "2"
      limits:
        nvidia.com/gpu: 2
        memory: "16Gi"
        cpu: "2"

    nodeSelector:
      kubernetes.io/hostname: node02

    # Environment variables referenced by masterCmd/slaveCmd.
    # Numeric values are quoted so they parse as strings rather than integers:
    # the Kubernetes core EnvVar `value` field is a string.
    # NOTE(review): presumably this CRD reuses the core EnvVar type - confirm.
    env:
      - name: ENABLE_USER_OP
        value: 'True'
      - name: DATA_ROOT
        value: '/dataset'
      # Same value as spec.size above.
      - name: NODE_NUM
        value: "3"
      # Same value as the nvidia.com/gpu request/limit above.
      - name: GPU_NUM_PER_NODE
        value: "2"
      - name: ONEFLOW_DEBUG_MODE
        value: ""
      - name: TRAIN_DATA_PART_NUM
        value: "6"
      - name: VAL_DATA_PART_NUM
        value: "6"
      - name: NCCL_DEBUG
        value: INFO

    volumeMounts:
      - mountPath: /dataset
        name: volume-0
      - mountPath: /workspace
        name: volume-1

    volumes:
      # NOTE(review): this NFS path already ends in ".../ofrecord/train" and is
      # mounted at /dataset, while the commands read $DATA_ROOT/train and
      # $DATA_ROOT/validation (DATA_ROOT=/dataset). Verify the export layout
      # really contains train/ and validation/ under this path.
      - name: volume-0
        nfs:
          path: /nfs/dubhe-prod/dataset/5/versionFile/V0001/ofrecord/train
          server: "{{NFS IP}}"
      - name: volume-1
        nfs:
          path: /nfs/dubhe-prod/train-manage/1/train-1-20200825173815-v0020
          server: "{{NFS IP}}"

    tolerations:
      - key: "platform/node-isolate"
        operator: "Equal"
        value: "prod-isolate-1"
        effect: "NoSchedule"

一站式算法开发平台、高性能分布式深度学习框架、先进算法模型库、视觉模型炼知平台、数据可视化分析平台等一系列平台及工具，在模型高效分布式训练、数据处理和可视分析、模型炼知和轻量化等技术上形成独特优势，目前已为产学研等各领域近千家单位及个人提供AI应用赋能