Spaces:
Sleeping
Sleeping
| # Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import collections | |
| import math | |
| import os | |
| import pathlib | |
| import re | |
| import pynvml | |
| pynvml.nvmlInit() | |
| def systemGetDriverVersion(): | |
| return pynvml.nvmlSystemGetDriverVersion() | |
| def deviceGetCount(): | |
| return pynvml.nvmlDeviceGetCount() | |
| class device: | |
| # assume nvml returns list of 64 bit ints | |
| _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) | |
| def __init__(self, device_idx): | |
| super().__init__() | |
| self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) | |
| def getName(self): | |
| return pynvml.nvmlDeviceGetName(self.handle) | |
| def getCpuAffinity(self): | |
| affinity_string = '' | |
| for j in pynvml.nvmlDeviceGetCpuAffinity( | |
| self.handle, device._nvml_affinity_elements | |
| ): | |
| # assume nvml returns list of 64 bit ints | |
| affinity_string = '{:064b}'.format(j) + affinity_string | |
| affinity_list = [int(x) for x in affinity_string] | |
| affinity_list.reverse() # so core 0 is in 0th element of list | |
| ret = [i for i, e in enumerate(affinity_list) if e != 0] | |
| return ret | |
| def set_socket_affinity(gpu_id): | |
| dev = device(gpu_id) | |
| affinity = dev.getCpuAffinity() | |
| os.sched_setaffinity(0, affinity) | |
| def set_single_affinity(gpu_id): | |
| dev = device(gpu_id) | |
| affinity = dev.getCpuAffinity() | |
| os.sched_setaffinity(0, affinity[:1]) | |
| def set_single_unique_affinity(gpu_id, nproc_per_node): | |
| devices = [device(i) for i in range(nproc_per_node)] | |
| socket_affinities = [dev.getCpuAffinity() for dev in devices] | |
| siblings_list = get_thread_siblings_list() | |
| siblings_dict = dict(siblings_list) | |
| # remove siblings | |
| for idx, socket_affinity in enumerate(socket_affinities): | |
| socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) | |
| affinities = [] | |
| assigned = [] | |
| for socket_affinity in socket_affinities: | |
| for core in socket_affinity: | |
| if core not in assigned: | |
| affinities.append([core]) | |
| assigned.append(core) | |
| break | |
| os.sched_setaffinity(0, affinities[gpu_id]) | |
| def set_socket_unique_affinity(gpu_id, nproc_per_node, mode): | |
| device_ids = [device(i) for i in range(nproc_per_node)] | |
| socket_affinities = [dev.getCpuAffinity() for dev in device_ids] | |
| siblings_list = get_thread_siblings_list() | |
| siblings_dict = dict(siblings_list) | |
| # remove siblings | |
| for idx, socket_affinity in enumerate(socket_affinities): | |
| socket_affinities[idx] = list(set(socket_affinity) - set(siblings_dict.values())) | |
| socket_affinities_to_device_ids = collections.defaultdict(list) | |
| for idx, socket_affinity in enumerate(socket_affinities): | |
| socket_affinities_to_device_ids[tuple(socket_affinity)].append(idx) | |
| for socket_affinity, device_ids in socket_affinities_to_device_ids.items(): | |
| devices_per_group = len(device_ids) | |
| cores_per_device = len(socket_affinity) // devices_per_group | |
| for group_id, device_id in enumerate(device_ids): | |
| if device_id == gpu_id: | |
| if mode == 'interleaved': | |
| affinity = list(socket_affinity[group_id::devices_per_group]) | |
| elif mode == 'continuous': | |
| affinity = list(socket_affinity[group_id*cores_per_device:(group_id+1)*cores_per_device]) | |
| else: | |
| raise RuntimeError('Unknown set_socket_unique_affinity mode') | |
| # reintroduce siblings | |
| affinity += [siblings_dict[aff] for aff in affinity if aff in siblings_dict] | |
| os.sched_setaffinity(0, affinity) | |
| def get_thread_siblings_list(): | |
| path = '/sys/devices/system/cpu/cpu*/topology/thread_siblings_list' | |
| thread_siblings_list = [] | |
| pattern = re.compile(r'(\d+)\D(\d+)') | |
| for fname in pathlib.Path(path[0]).glob(path[1:]): | |
| with open(fname) as f: | |
| content = f.read().strip() | |
| res = pattern.findall(content) | |
| if res: | |
| pair = tuple(map(int, res[0])) | |
| thread_siblings_list.append(pair) | |
| return thread_siblings_list | |
| def set_affinity(gpu_id, nproc_per_node, mode='socket'): | |
| if mode == 'socket': | |
| set_socket_affinity(gpu_id) | |
| elif mode == 'single': | |
| set_single_affinity(gpu_id) | |
| elif mode == 'single_unique': | |
| set_single_unique_affinity(gpu_id, nproc_per_node) | |
| elif mode == 'socket_unique_interleaved': | |
| set_socket_unique_affinity(gpu_id, nproc_per_node, 'interleaved') | |
| elif mode == 'socket_unique_continuous': | |
| set_socket_unique_affinity(gpu_id, nproc_per_node, 'continuous') | |
| else: | |
| raise RuntimeError('Unknown affinity mode') | |
| affinity = os.sched_getaffinity(0) | |
| return affinity | |