1. Carlos Reaño and Federico Silla. Redesigning the rCUDA communication layer for a better adaptation to the underlying hardware. Concurrency and computation 33(14), 2021. BibTeX

@article{ReañoCarlos2021Rtrc,
  author    = {Reaño, Carlos and Silla, Federico},
  title     = {Redesigning the {rCUDA} communication layer for a better adaptation to the underlying hardware},
  journal   = {Concurrency and Computation: Practice and Experience},
  volume    = {33},
  number    = {14},
  year      = {2021},
  publisher = {Wiley Subscription Services, Inc},
  issn      = {1532-0626},
  language  = {eng},
  keywords  = {CUDA ; GPGPU ; HPC ; InfiniBand ; virtualization},
  abstract  = {The use of Graphics Processing Units (GPUs) has become a very popular way to accelerate the execution of many applications. However, GPUs are not exempt from side effects. For instance, GPUs are expensive devices which additionally consume a non-negligible amount of energy even when they are not performing any computation. Furthermore, most applications present low GPU utilization. To address these concerns, the use of GPU virtualization has been proposed. In particular, remote GPU virtualization is a promising technology that allows applications to transparently leverage GPUs installed in any node of the cluster. In this paper, the remote GPU virtualization mechanism is comparatively analyzed across three different generations of GPUs. The first contribution of this study is an analysis about how the performance of the remote GPU virtualization technique is impacted by the underlying hardware. To that end, the Tesla K20, Tesla K40, and Tesla P100 GPUs along with FDR and EDR InfiniBand fabrics are used in the study. The analysis is performed in the context of the rCUDA middleware. It is clearly shown that the GPU virtualization middleware requires a comprehensive design of its communication layer, which should be perfectly adapted to every hardware generation in order to avoid a reduction in performance. This is precisely the second contribution of this work, i.e., redesigning the rCUDA communication layer in order to improve the management of the underlying hardware. Results show that it is possible to improve bandwidth up to 29.43\%, which translates into up to 4.81\% average less execution time in the performance of the analyzed applications.},
}

2. Sergio Iserte, Javier Prades, Carlos Reaño and Federico Silla. Improving the management efficiency of GPU workloads in data centers through GPU virtualization. Concurrency and computation 33(2), 2021. BibTeX

@article{IserteSergio2021Itme,
  author    = {Iserte, Sergio and Prades, Javier and Reaño, Carlos and Silla, Federico},
  title     = {Improving the management efficiency of {GPU} workloads in data centers through {GPU} virtualization},
  journal   = {Concurrency and Computation: Practice and Experience},
  volume    = {33},
  number    = {2},
  year      = {2021},
  publisher = {Wiley Subscription Services, Inc},
  issn      = {1532-0626},
  language  = {eng},
  keywords  = {CUDA ; data centers ; GPU ; InfiniBand ; rCUDA ; Slurm ; Virtualization},
  abstract  = {Graphics processing units (GPUs) are currently used in data centers to reduce the execution time of compute-intensive applications. However, the use of GPUs presents several side effects, such as increased acquisition costs and larger space requirements. Furthermore, GPUs require a nonnegligible amount of energy even while idle. Additionally, GPU utilization is usually low for most applications. In a similar way to the use of virtual machines, using virtual GPUs may address the concerns associated with the use of these devices. In this regard, the remote GPU virtualization mechanism could be leveraged to share the GPUs present in the computing facility among the nodes of the cluster. This would increase overall GPU utilization, thus reducing the negative impact of the increased costs mentioned before. Reducing the amount of GPUs installed in the cluster could also be possible. However, in the same way as job schedulers map GPU resources to applications, virtual GPUs should also be scheduled before job execution. Nevertheless, current job schedulers are not able to deal with virtual GPUs. In this paper, we analyze the performance attained by a cluster using the remote Compute Unified Device Architecture middleware and a modified version of the Slurm scheduler, which is now able to assign remote GPUs to jobs. Results show that cluster throughput, measured as jobs completed per time unit, is doubled at the same time that the total energy consumption is reduced up to 40\%. GPU utilization is also increased.},
}

3. Carlos Reaño, Federico Silla and Blesson Varghese. PII: S0743-7315(20)30386-5. Journal of parallel and distributed computing 147:268-269, 2021. BibTeX

@article{ReañoCarlos2021PS,
  author        = {Reaño, Carlos and Silla, Federico and Varghese, Blesson},
  title         = {{PII}: {S0743-7315(20)30386-5}},
  journal       = {Journal of Parallel and Distributed Computing},
  volume        = {147},
  pages         = {268--269},
  year          = {2021},
  publisher     = {Elsevier Inc},
  issn          = {0743-7315},
  language      = {eng},
  internal-note = {NOTE(review): the title appears to be a publisher item identifier (PII) left by the catalogue export rather than the real article title; look up the actual title before citing.},
}

4. Carlos Reaño, Federico Silla and Blesson Varghese. Accelerator virtualization. Concurrency and computation, 2021. BibTeX

@article{ReañoCarlos2021Av,
  author   = {Reaño, Carlos and Silla, Federico and Varghese, Blesson},
  title    = {Accelerator virtualization},
  journal  = {Concurrency and Computation: Practice and Experience},
  year     = {2021},
  issn     = {1532-0626},
  language = {eng},
}

5. Daniel Hernandez, Juan-Carlos Cano, Federico Silla, Carlos T Calafate and Jose M Cecilia. AI-enabled autonomous drones for fast climate change crisis assessment. IEEE internet of things journal, pages 1-1, 2021. BibTeX

@article{HernandezDaniel2021Aadf,
  author        = {Hernandez, Daniel and Cano, Juan-Carlos and Silla, Federico and Calafate, Carlos T. and Cecilia, Jose M.},
  title         = {{AI}-enabled autonomous drones for fast climate change crisis assessment},
  journal       = {IEEE Internet of Things Journal},
  pages         = {1--1},
  year          = {2021},
  publisher     = {IEEE},
  issn          = {2327-4662},
  language      = {eng},
  keywords      = {Artificial Vision ; Climate Change ; Cloud computing ; Clustering algorithms ; Deep Learning ; Drones ; Edge computing ; Internet of Things ; Performance evaluation ; Pipelines ; Sustainable ICT ; UAVs},
  abstract      = {Climate change is one of the greatest challenges for modern societies. Its consequences, often associated with extreme events, have dramatic results worldwide. New synergies between different disciplines including Artificial Intelligence (AI), Internet of Things (IoT), and edge computing can lead to radically new approaches for the real-time tracking of natural disasters that are also designed to reduce the environmental footprint. In this article, we propose an AI-based pipeline for processing natural disaster images taken from drones. The purpose of this pipeline is to reduce the number of images to be processed by the first responders of the natural disaster. It consists of three main stages, (1) a lightweight auto-encoder based on deep learning, (2) a dimensionality reduction using the t-SNE algorithm and (3) a fuzzy clustering procedure. This pipeline is evaluated on several edge computing platforms with low-power accelerators to assess the design of intelligent autonomous drones to provide this service in real time. Our experimental evaluation focuses on flooding, showing that the amount of information to be processed is substantially reduced whereas edge computing platforms with low-power GPUs are placed as a compelling alternative for processing these heavy computational workloads, obtaining a performance loss of only 2.3x compared to its cloud counterpart version, running both the training and inference steps.},
  internal-note = {NOTE(review): pages 1--1 and the missing volume/number look like an early-access placeholder; confirm the final volume, issue and page range against the publisher record.},
}

6. Javier Prades Gasulla. Improving Performance and Energy Efficiency of Heterogeneous Systems with rCUDA. 2021. BibTeX

@misc{PradesGasullaJavier2021IPaE,
  author    = {Prades Gasulla, Javier},
  title     = {Improving Performance and Energy Efficiency of Heterogeneous Systems with {rCUDA}},
  year      = {2021},
  publisher = {Universitat Politècnica de València},
  language  = {eng},
  keywords  = {ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Cloud Computing ; Computación de altas prestaciones ; Energy Efficiency ; GPGPU ; Graphics processing units (GPU) ; Heterogeneous systems ; High Performance Computing ; HPC ; rCUDA ; Unidades de procesamiento gráfico},
}

7. Adrián Castelló, Enrique S Quintana-Ortí and José Duato. Accelerating distributed deep neural network training with pipelined MPI allreduce. Cluster computing 24(4):3797-3813, 2021. BibTeX

@article{CastellóAdrián2021Addn,
  author    = {Castelló, Adrián and Quintana-Ortí, Enrique S. and Duato, José},
  title     = {Accelerating distributed deep neural network training with pipelined {MPI} allreduce},
  journal   = {Cluster Computing},
  volume    = {24},
  number    = {4},
  pages     = {3797--3813},
  year      = {2021},
  publisher = {Springer US},
  issn      = {1386-7857},
  language  = {eng},
  keywords  = {Algorithms ; Analysis ; Article ; Artificial neural networks ; Communication ; Computer Communication Networks ; Computer Science ; Neural networks ; Operating Systems ; Processor Architectures ; Synchronism ; Training ; Usage},
  abstract  = {TensorFlow (TF) is usually combined with the Horovod (HVD) workload distribution package to obtain a parallel tool to train deep neural network on clusters of computers. HVD in turn utilizes a blocking Allreduce primitive to share information among processes, combined with a communication thread to overlap communication with computation. In this work, we perform a thorough experimental analysis to expose (1) the importance of selecting the best algorithm in MPI libraries to realize the Allreduce operation; and (2) the performance acceleration that can be attained when replacing a blocking Allreduce with its non-blocking counterpart (while maintaining the blocking behaviour via the appropriate synchronization mechanism). Furthermore, (3) we explore the benefits of applying pipelining to the communication exchange, demonstrating that these improvements carry over to distributed training via TF+HVD. Finally, (4) we show that pipelining can also boost performance for applications that make heavy use of other collectives, such as Broadcast and Reduce-Scatter.},
}

8. Cristina Olmedilla, Jesus Escudero-Sahuquillo, Pedro Javier Garcia-Garcia, Francisco Alfaro-Cortes, Jose L Sanchez, Francisco J Quiles, Wenhao Sun, Xiang Yu, Yonghui Xu and Jose Duato. DVL-Lossy: Isolating Congesting Flows to Optimize Packet Dropping in Lossy Data-Center Networks. IEEE MICRO 41(1):37-44, 2021. BibTeX

@article{OlmedillaCristina2021DICF,
  author    = {Olmedilla, Cristina and Escudero-Sahuquillo, Jesus and Garcia-Garcia, Pedro Javier and Alfaro-Cortes, Francisco and Sanchez, Jose L. and Quiles, Francisco J. and Sun, Wenhao and Yu, Xiang and Xu, Yonghui and Duato, Jose},
  title     = {{DVL-Lossy}: Isolating Congesting Flows to Optimize Packet Dropping in Lossy Data-Center Networks},
  journal   = {IEEE Micro},
  volume    = {41},
  number    = {1},
  pages     = {37--44},
  year      = {2021},
  publisher = {IEEE},
  issn      = {0272-1732},
  language  = {eng},
  keywords  = {Bandwidth ; Congestion Management ; Delays ; Dynamic Virtual Lanes ; Lossy Data-center Networks ; Packet loss ; Standards organizations ; Switches ; Topology},
  abstract  = {The performance of lossy data-center networks (DCNs) may degrade due to packet dropping (and possible retransmission) under congestion. In this article, we propose and evaluate a solution to deal with congestion in lossy DCNs, based on the same approach as the dynamic virtual lanes technique, previously proposed for lossless DCNs. This approach consists of isolating congesting flows in special queues, so that they do not share queues with noncongesting ones. This reduces the probability of standard queues becoming congested, thus reducing the dropping (and retransmission) of noncongesting packets and improving network performance. The experiment results confirm that these benefits are achieved by adding just a single special queue per switch port.},
}

9. Adrian Castello, Mar Catalan, Manuel F Dolz, Jose I Mestre, Enrique S Quintana-Orti and Jose Duato. Performance Modeling for Distributed Training of Convolutional Neural Networks. In 2021 29th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP). 2021, 99-108. BibTeX

@inproceedings{CastelloAdrian2021PMfD,
  author    = {Castello, Adrian and Catalan, Mar and Dolz, Manuel F. and Mestre, Jose I. and Quintana-Orti, Enrique S. and Duato, Jose},
  title     = {Performance Modeling for Distributed Training of Convolutional Neural Networks},
  booktitle = {2021 29th Euromicro International Conference on Parallel, Distributed and Network-Based Processing ({PDP})},
  pages     = {99--108},
  year      = {2021},
  publisher = {IEEE},
  isbn      = {9781665414555},
  issn      = {2377-5750},
  language  = {eng},
  keywords  = {analytical modeling ; Analytical models ; Bandwidth ; clusters ; Deep neural networks (DNNs) ; distributed training ; Neural networks ; Organizations ; Parallel processing ; Scalability ; Training},
  abstract  = {We perform a theoretical analysis comparing the scalability of data versus model parallelism, applied to the distributed training of deep convolutional neural networks (CNNs), along five axes: batch size, node (floating-point) arithmetic performance, node memory bandwidth, network link bandwidth, and cluster dimension. Our study relies on analytical performance models that can be configured to reproduce the components and organization of the CNN model as well as the hardware configuration of the target distributed platform. In addition, we provide evidence of the accuracy of the analytical models by performing a validation against a Python library for distributed deep learning training.},
}

10. Adrian Castello, Mar Catalan, Manuel F Dolz, Jose I Mestre, Enrique S Quintana-Orti and Jose Duato. Evaluation of MPI Allreduce for Distributed Training of Convolutional Neural Networks. In 2021 29th Euromicro International Conference on Parallel, Distributed and Network-Based Processing (PDP). 2021, 109-116. BibTeX

@inproceedings{CastelloAdrian2021EoMA,
  author    = {Castello, Adrian and Catalan, Mar and Dolz, Manuel F. and Mestre, Jose I. and Quintana-Orti, Enrique S. and Duato, Jose},
  title     = {Evaluation of {MPI} {Allreduce} for Distributed Training of Convolutional Neural Networks},
  booktitle = {2021 29th Euromicro International Conference on Parallel, Distributed and Network-Based Processing ({PDP})},
  pages     = {109--116},
  year      = {2021},
  publisher = {IEEE},
  isbn      = {9781665414555},
  issn      = {2377-5750},
  language  = {eng},
  keywords  = {Allreduce ; collective communication primitives ; Convolutional neural networks ; Deep learning ; distributed training ; Graphics ; Message passing ; Message Passing Interface (MPI) ; Neural networks ; Tensors ; Training},
  abstract  = {Training deep neural networks is a costly procedure, often performed via sophisticated deep learning frameworks on clusters of computers. As faster processor technologies are integrated into these cluster facilities (e.g., NVIDIA's graphics accelerators or Google's tensor processing units), the communication component of the training process rapidly becomes a performance bottleneck. In this paper, we offer a complete analysis of the key collective communication primitive for the distributed data-parallel training of convolutional neural networks (CNNs) focused on three relevant instances of the Message Passing Interface (MPI): MPICH, OpenMPI, and IntelMPI. In addition, our experimental evaluation is extended to expose the practical impact of this collective primitive when the training is performed using TensorFlow+Horovod on a 16-node cluster. Finally, the theoretical analysis is further refined to a number of accelerated cluster configurations that are emulated by adjusting the communication-arithmetic ratio of the training process.},
}

11. Li Shen, Wenhao Sun, Xiang Yu and José Duato. Packet Control Method, Flow Table Update Method, and Node Device. 2021. BibTeX

@misc{ShenLi2021PCMF,
  author   = {Shen, Li and Sun, Wenhao and Yu, Xiang and Duato, José},
  title    = {Packet Control Method, Flow Table Update Method, and Node Device},
  year     = {2021},
  language = {eng},
  keywords = {ELECTRIC COMMUNICATION TECHNIQUE ; ELECTRICITY ; TRANSMISSION OF DIGITAL INFORMATION, e.g. TELEGRAPHIC COMMUNICATION},
  abstract = {A packet control method, a flow table update method, and a node device including a first queue and a second queue, where the method includes: obtaining, by the node device, a first packet; determining, by the node device, that a data flow to which the first packet belongs is marked as an isolated flow; and if the first queue and/or the second queue meet and/or meets a first preset condition, controlling, by the node device, the first packet to enter the first queue and wait to be scheduled; or if the first queue and/or the second queue meet and/or meets a second preset condition, controlling, by the node device, the first packet to enter the second queue and wait to be scheduled.},
}

12. Xiang YU, Wenhao SUN, José DUATO and Li SHEN. PACKET CONTROL METHOD AND NODE DEVICE. 2021. BibTeX

@misc{YUXiang2021PCMA,
  author   = {Yu, Xiang and Sun, Wenhao and Duato, José and Shen, Li},
  title    = {Packet Control Method and Node Device},
  year     = {2021},
  language = {eng},
  keywords = {ELECTRIC COMMUNICATION TECHNIQUE ; ELECTRICITY ; TRANSMISSION OF DIGITAL INFORMATION, e.g. TELEGRAPHIC COMMUNICATION},
  abstract = {The present invention discloses a packet control method and a node device, to improve reliability of a data flow in a transmission process. The method includes: After receiving a pause frame, a first node automatically applies, based on adjustment information that is of a send queue of a data flow and that is recorded in a state record set, the pause frame to all queues associated in an adjustment process of the send queue of the data flow. In this way, a packet loss problem in a data transmission process can be avoided without adjusting an XOFF/XON threshold of a receive queue and without increasing a quantity of pause frames in a network system, thereby improving reliability of the data flow in the transmission process.},
}

13. Xiang YU, Wenhao SUN, José DUATO and Li SHEN. MESSAGE CONTROL METHOD AND NODE DEVICE. 2021. BibTeX

@misc{YUXiang2021MCMA,
  author   = {Yu, Xiang and Sun, Wenhao and Duato, José and Shen, Li},
  title    = {Message Control Method and Node Device},
  year     = {2021},
  language = {eng ; fre ; ger},
  keywords = {ELECTRIC COMMUNICATION TECHNIQUE ; ELECTRICITY ; TRANSMISSION OF DIGITAL INFORMATION, e.g. TELEGRAPHIC COMMUNICATION},
  abstract = {The present invention discloses a packet control method and a node device, to improve reliability of a data flow in a transmission process. The method includes: After receiving a pause frame, a first node automatically applies, based on adjustment information that is of a send queue of a data flow and that is recorded in a state record set, the pause frame to all queues associated in an adjustment process of the send queue of the data flow. In this way, a packet loss problem in a data transmission process can be avoided without adjusting an XOFF/XON threshold of a receive queue and without increasing a quantity of pause frames in a network system, thereby improving reliability of the data flow in the transmission process.},
}

14. Xiang YU, Wenhao SUN, José DUATO and Li SHEN. MESSAGE CONTROL METHOD, FLOW TABLE UPDATING METHOD, AND NODE DEVICE. 2021. BibTeX

@misc{YUXiang2021MCMF,
  author   = {Yu, Xiang and Sun, Wenhao and Duato, José and Shen, Li},
  title    = {Message Control Method, Flow Table Updating Method, and Node Device},
  year     = {2021},
  language = {eng ; fre ; ger},
  keywords = {ELECTRIC COMMUNICATION TECHNIQUE ; ELECTRICITY ; TRANSMISSION OF DIGITAL INFORMATION, e.g. TELEGRAPHIC COMMUNICATION},
  abstract = {A packet control method, a flow table update method, and a node device are provided. The node device includes a first queue and a second queue. The method includes: obtaining, by the node device, a first packet; determining, by the node device, that a data flow to which the first packet belongs is marked as an isolated flow; and if the first queue and/or the second queue meet and/or meets a first preset condition, controlling, by the node device, the first packet to enter the first queue and wait to be scheduled, or if the first queue and/or the second queue meet and/or meets a second preset condition, controlling, by the node device, the first packet to enter the second queue and wait to be scheduled. In this way, the node device can relatively flexibly control a packet of the data flow, marked as an isolated flow, to enter a queue.},
}

15. Tomas Picornell, Jose Flich, Carles Hernandez and Jose Duato. Enforcing Predictability of Many-Cores With DCFNoC. IEEE transactions on computers 70(2):270-283, 2021. BibTeX

@article{PicornellTomas2021EPoM,
  author    = {Picornell, Tomas and Flich, Jose and Hernandez, Carles and Duato, Jose},
  title     = {Enforcing Predictability of Many-Cores With {DCFNoC}},
  journal   = {IEEE Transactions on Computers},
  volume    = {70},
  number    = {2},
  pages     = {270--283},
  year      = {2021},
  publisher = {IEEE},
  issn      = {0018-9340},
  doi       = {10.1109/TC.2020.2987797},
  language  = {eng},
  keywords  = {Delays ; Electronic mail ; Interference ; MPSoCs ; Multiprocessor interconnection ; Real-time systems ; Routing ; safety-critical systems ; Software ; Time division multiplexing ; time division multiplexing (TDM) ; time predictable network},
  abstract  = {The ever need for higher performance forces industry to include technology based on multi-processors system on chip (MPSoCs) in their safety-critical embedded systems. MPSoCs include a network-on-chip (NoC) to interconnect the cores between them and with memory and the rest of shared resources. Unfortunately, the inclusion of NoCs compromises guaranteeing time predictability as network-level conflicts may occur. To overcome this problem, in this article we propose DCFNoC, a new time-predictable NoC design paradigm where conflicts within the network are eliminated by design. This new paradigm builds on top of the Channel Dependency Graph (CDG) in order to deterministically avoid network conflicts. The network guarantees predictability to applications and is able to naturally inject messages using a TDM period equal to the optimal theoretical bound without the need of using a computationally demanding offline process. DCFNoC is integrated in a tile-based many-core system and adapted to its memory hierarchy. Our results show that DCFNoC guarantees time predictability avoiding network interference among multiple running applications. DCFNoC always guarantees performance and also improves wormhole performance in a $4 \times 4$ setting by a factor of $3.7\times$ when interference traffic is injected. For a $8 \times 8$ network differences are even larger. In addition, DCFNoC obtains a total area saving of 10.79 percent over a standard wormhole implementation.},
}

16. Giovanni Agosta, William Fornaciari, David Atienza, Ramon Canal, Alessandro Cilardo, José Flich Cardo, Carles Hernandez Luz, Michal Kulczewski, Giuseppe Massari, Rafael Tornero Gavilá and Marina Zapater. The RECIPE approach to challenges in deeply heterogeneous high performance systems. Microprocessors and microsystems 77:103185, 2020. BibTeX

@article{AgostaGiovanni2020TRat,
  author    = {Agosta, Giovanni and Fornaciari, William and Atienza, David and Canal, Ramon and Cilardo, Alessandro and Flich Cardo, José and Hernandez Luz, Carles and Kulczewski, Michal and Massari, Giuseppe and Tornero Gavilá, Rafael and Zapater, Marina},
  title     = {The {RECIPE} approach to challenges in deeply heterogeneous high performance systems},
  journal   = {Microprocessors and Microsystems},
  volume    = {77},
  pages     = {103185},
  year      = {2020},
  publisher = {Elsevier B.V.},
  issn      = {0141-9331},
  language  = {eng},
  keywords  = {Computer Science - Distributed, Parallel, and Cluster Computing ; Heterogeneous computing ; HPC ; Run-time management},
  abstract  = {RECIPE (REliable power and time-ConstraInts-aware Predictive management of heterogeneous Exascale systems) is a recently started project funded within the H2020 FETHPC programme, which is expressly targeted at exploring new High-Performance Computing (HPC) technologies. RECIPE aims at introducing a hierarchical runtime resource management infrastructure to optimize energy efficiency and minimize the occurrence of thermal hotspots, while enforcing the time constraints imposed by the applications and ensuring reliability for both time-critical and throughput-oriented computation that run on deeply heterogeneous accelerator-based systems. This paper presents a detailed overview of RECIPE, identifying the fundamental challenges as well as the key innovations addressed by the project. In particular, the need for predictive reliability approaches to maximizing hardware lifetime and guarantee application performance is identified as the key concern for RECIPE. We address it through hierarchical resource management of the heterogeneous architectural components of the system, driven by estimates of the application latency and hardware reliability obtained respectively through timing analysis and modeling thermal properties and mean-time-to-failure of subsystems. We show the impact of prediction accuracy on the overheads imposed by the checkpointing policy, as well as a possible application to a weather forecasting use case.},
}

17. Juan-José Crespo, José L Sánchez, Francisco J Alfaro-Cortés, José Flich and José Duato. UPR: deadlock-free dynamic network reconfiguration by exploiting channel dependency graph compatibility. The Journal of supercomputing 77(11):12826-12856, 2021. BibTeX

@article{CrespoJuan-José2021Uddn,
  author    = {Crespo, Juan-José and Sánchez, José L. and Alfaro-Cortés, Francisco J. and Flich, José and Duato, José},
  title     = {{UPR}: deadlock-free dynamic network reconfiguration by exploiting channel dependency graph compatibility},
  journal   = {The Journal of Supercomputing},
  volume    = {77},
  number    = {11},
  pages     = {12826--12856},
  year      = {2021},
  publisher = {Springer US},
  issn      = {0920-8542},
  language  = {eng},
  keywords  = {Algorithms ; Article ; Compatibility ; Compilers ; Computer Science ; Computer Science - Networking and Internet Architecture ; general ; Interpreters ; Processor Architectures ; Programming Languages ; Reconfiguration},
  abstract  = {Deadlock-free dynamic network reconfiguration process is usually studied from the routing algorithm restrictions and resource reservation perspective. The dynamic nature yielded by the transition process from one routing function to another is often managed by restricting resource usage in a static predefined manner, which often limits the supported routing algorithms and/or inactive link patterns, or either requires additional resources such as virtual channels. Exploiting compatibility between routing functions by exploring their associated channel dependency graphs (CDG) leads to a better reconfiguration process given its dynamic nature. In this paper, we propose a new dynamic reconfiguration process called Upstream Progressive Reconfiguration (UPR). Our algorithm progressively performs dependency addition/removal in a per channel basis relying on the information provided by the CDG, while the reconfiguration process takes place. This gives us the opportunity to foresee compatible scenarios where both routing functions coexist, reducing the needed amount of resource drainage as well as packet injection halting.},
}

18. Tomás Picornell-Sanjuan, José Flich Cardo, Carles Hernández Luz and José Francisco Duato Marín. Enforcing Predictability of Many-cores with DCFNoC. 2021. BibTeX

@article{Picornell-SanjuanTomás2021EPoM,
  author    = {Picornell-Sanjuan, Tomás and Flich Cardo, José and Hernández Luz, Carles and Duato Marín, José Francisco},
  title     = {Enforcing Predictability of Many-cores with {DCFNoC}},
  journal   = {IEEE Transactions on Computers},
  volume    = {70},
  number    = {2},
  pages     = {270--283},
  year      = {2021},
  publisher = {Institute of Electrical and Electronics Engineers},
  doi       = {10.1109/TC.2020.2987797},
  language  = {eng},
  keywords  = {ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; MPSoCs ; Real-time systems ; Safety-critical systems ; Time division multiplexing (TDM) ; Time predictable network},
  abstract  = {The ever need for higher performance forces industry to include technology based on multi-processors system on chip (MPSoCs) in their safety-critical embedded systems. MPSoCs include a network-on-chip (NoC) to interconnect the cores between them and with memory and the rest of shared resources. Unfortunately, the inclusion of NoCs compromises guaranteeing time predictability as network-level conflicts may occur. To overcome this problem, in this paper we propose DCFNoC, a new time-predictable NoC design paradigm where conflicts within the network are eliminated by design. This new paradigm builds on top of the Channel Dependency Graph (CDG) in order to deterministically avoid network conflicts. The network guarantees predictability to applications and is able to naturally inject messages using a TDM period equal to the optimal theoretical bound without the need of using a computationally demanding offline process. DCFNoC is integrated in a tile-based many-core system and adapted to its memory hierarchy. Our results show that DCFNoC guarantees time predictability avoiding network interference among multiple running applications. DCFNoC always guarantees performance and also improves wormhole performance in a 4 × 4 setting by a factor of 3.7× when interference traffic is injected. For a 8 × 8 network differences are even larger. In addition, DCFNoC obtains a total area saving of 10.79\% over a standard wormhole implementation.},
  copyright = {© 2021 IEEE. Personal use of this material is permitted. Permission from IEEE must be obtained for all other uses, in any current or future media, including reprinting/republishing this material for advertising or promotional purposes, creating new collective works, for resale or redistribution to servers or lists, or reuse of any copyrighted component of this work in other works.},
  funding   = {This work has been supported by MINECO under Grant BES-2016-076885, by MINECO and funds from the European ERDF under Grant TIN2015-66972-C05-1-R and Grant RTI2018-098156-B-C51, and by the EC H2020 RECIPE project under Grant 801137.},
}

19. Brian Miguel Mcmullen García. Deepwise Separable Convolution Support in Neural Network Platform. 2021. BibTeX

@misc{McmullenGarcíaBrianMiguel2021DSCS,
  author    = {Mcmullen García, Brian Miguel},
  title     = {Deepwise Separable Convolution Support in Neural Network Platform},
  year      = {2021},
  publisher = {Universitat Politècnica de València},
  language  = {eng},
  keywords  = {Architectures ; ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Arquitecturas ; Artificial intelligence ; Convoluciones ; Convolution ; Entrenamiento ; Grado en Ingeniería Informática-Grau en Enginyeria Informàtica ; Inference ; Inferencia ; Inteligencia artificial ; Training},
}

20. Roberto Díaz-Cano Lozano. Desenvolupament de processos d’entrenament i inferència amb aritmètica en coma flotant de 16 bits sobre la plataforma Jetson Xavier. 2021. BibTeX

@comment{The url field repeats the handle that closes the abstract so that styles and tools can link it. Typed as a thesis so the institution is printed; degree level is inferred from the "Grado en Ingenieria Informatica" keyword; TODO confirm.}
@mastersthesis{Díaz-CanoLozanoRoberto2021Ddpd,
abstract = "[CA] La intel·ligència artificial (IA) s’està convertint en un element imprescindible en diferents àmbits de la informàtica. Al mateix temps que la IA s’està desenvolupant a escala d’algoritmes, també les arquitectures de processament s’hi estan adaptant per donar un millor suport. Per aquest motiu en el present treball es desenvolupa un suport en aritmètica en coma flotant de 16 bits sobre una plataforma d’entrenament i inferència de xarxes neuronals. Aquest desenvolupament es realitza sobre el dispositiu Jetson AGX Xavier de l’empresa NVIDIA, el qual està destinat a aplicacions d’intel·ligència artificial, com ara l’aprenentatge profund (deep learning). L’objectiu és dotar a l’aplicació HELENNA d’un suport que li permeta utilitzar els nombres en coma flotant de 16 bits sobre la GPU del dispositiu de NVIDIA, a través del llenguatge de programació CUDA. D’aquesta manera es podrà aconseguir un millor aprofitament dels recursos i del consum energètic, ja que amb l’aritmètica de precisió reduïda es pot incrementar l’eficiència dels entrenaments amb xarxes neuronals. [ES] La inteligencia artificial (IA) se está convirtiendo en un elemento imprescindible en diferentes ámbitos de la informática. Al mismo tiempo que la IA se está desarrollando a escala de algoritmos, también las arquitecturas de procesamiento se están adaptando para dar un mejor apoyo. Por este motivo en el presente trabajo se desarrolla un soporte en aritmética en coma flotante de 16 bits sobre una plataforma de entrenamiento e inferencia de redes neuronales. Este desarrollo se realiza sobre el dispositivo Jetson AGX Xavier de la empresa NVIDIA, el cual está destinado a aplicaciones de inteligencia artificial, como el aprendizaje profundo (deep learning). El objetivo es dar a la aplicación HELENNA de un soporte que le permita utilizar nombres en coma flotante de 16 bits sobre la GPU del dispositivo de NVIDIA, a través del lenguaje de programación CUDA. 
De este modo se podrá conseguir un mejor aprovechamiento de los recursos y del consumo energético, puesto que con la aritmética de precisión reducida se puede incrementar la eficiencia de los entrenamientos con redes neuronales. [EN] Artificial Intelligence (AI) is becoming an important element in different areas of computing. At the same time that AI is developing at the algorithm scale, processing architectures are also adapting to give a better support to it. For this reason, in the present project a 16-bit floating point arithmetic support is developed on a neural network training and inference platform. This development is carried out on the Jetson AGX Xavier device from NVIDIA, which is destined for artificial intelligence applications, like deep learning. The objective is to give the HELENNA application support that allows it to use 16-bit floating point numbers on the GPU of the NVIDIA device, through the CUDA programming language. In this way, better use of resources and energy consumption can be achieved, since with reduced precision arithmetic, the efficiency of training with neural networks can be increased. Díaz-Cano Lozano, R. (2021). Desenvolupament de processos d’entrenament i inferència amb aritmètica en coma flotant de 16 bits sobre la plataforma Jetson Xavier. Universitat Politècnica de València. http://hdl.handle.net/10251/172214",
author = "Díaz-Cano Lozano, Roberto",
keywords = "16-bit floating point arithmetic ; Aprenentatge profund ; Aritmètica en coma flotant de 16 bits ; ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Artificial Intelligence ; CUDA ; Deep learning ; GPU ; Grado en Ingeniería Informática-Grau en Enginyeria Informàtica ; HELENNA ; Intel·ligència artificial ; Neuronal networks ; Xarxes neuronals",
language = "cat",
publisher = "Universitat Politècnica de València",
school = "Universitat Politècnica de València",
title = "Desenvolupament de processos d’entrenament i inferència amb aritmètica en coma flotant de 16 bits sobre la plataforma Jetson Xavier",
type = "Bachelor's thesis",
url = "http://hdl.handle.net/10251/172214",
year = 2021
}

21. Pere Díaz Bou. Desarrollo de Soporte de Redes Recurrentes en Plataforma de Entrenamiento. 2021. BibTeX

@comment{The url field repeats the handle that closes the abstract so that styles and tools can link it. Typed as a thesis so the institution is printed; degree level is inferred from the "Grado en Ingenieria Informatica" keyword; TODO confirm.}
@mastersthesis{DíazBouPere2021DdSd,
abstract = "[ES] La investigación en inteligencia artificial está creciendo a un ritmo constante y gracias a ello, se están resolviendo problemas que hasta hace poco parecían imposibles. Las Redes Neuronales son unos de los algoritmos mas usados para resolver estos problemas, y existen mas de un tipo de Red Neuronal. Cada una con cierta utilidad y complejidad. En este trabajo realizamos un análisis de las Redes Recurrentes. En concreto, analizamos su estructura y su funcionamiento. Posteriormente, implementaremos las Redes Recurrentes en HELENNA, una plataforma de entrenamiento e inferencia de Redes Neuronales. Para acabar, realizamos una evaluación funcional de la Red Recurrente para validar la correcta implementación. La implementación de la Red Neuronal Recurrentes será capaz de resolver problemas básicos como la predicción de dígitos escritos a mano con un porcentaje de precisión alto. [EN] Research in artificial intelligence is growing at incredible rates which allowed us to solve problems that seemed impossible before. One of the best known algorithms to solve artificial intelligence problems is neural networks. There are different types of neural networks, with different complexities and applications. Moreover, we will implement Recurrent Neural Networks in HELENNA, a training and inference plataform for Neural Networks. At last, we will evaluate the functional aspect of the Recurrent Neural Network to validate the correctness of it. Our recurrent neural network implementation will be able to predict with high accuracy handwritten digits. [CA] La recerca en intel·ligència artificial està creixent constantment i gràcies a això, ens ha permet resoldre problemes que abans pareixien impossibles. Les Xarxes Neuronals son uns dels algorismes mes usats per a resoldre aquests problemes. Hi ha mes de un tipus de Xarxa Neuronal, cada una usada per a diferents aplicacions i, amb diferents complexitats. 
A més a més, implementarem les Xarxes Recurrents en HELENNA, una plataforma d’entrenament i inferència de Xarxes Neuronals. Per finalitzar, farem una avaluació funcional de la Xarxa Recurrent per validar la correcta implementació. La implementació de la Xarxa Neuronal recurrent será capaç de resoldre problemes senzills com la predicció de dígits escrits a mà amb un alt nivell de precisió. Díaz Bou, P. (2021). Desarrollo de Soporte de Redes Recurrentes en Plataforma de Entrenamiento. Universitat Politècnica de València. http://hdl.handle.net/10251/173384",
author = "Díaz Bou, Pere",
keywords = "ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Artificial Intelligence ; Grado en Ingeniería Informática-Grau en Enginyeria Informàtica ; Inteligencia artificial ; Plataforma de entrenamiento ; Recurrent Neural Networks ; Redes neuronales recurrentes ; Training Plataform",
language = "spa",
publisher = "Universitat Politècnica de València",
school = "Universitat Politècnica de València",
title = "Desarrollo de Soporte de Redes Recurrentes en Plataforma de Entrenamiento",
type = "Bachelor's thesis",
url = "http://hdl.handle.net/10251/173384",
year = 2021
}

22. Jose Duro, Salvador Petit, Maria E Gomez and Julio Sahuquillo. Segment Switching: A New Switching Strategy for Optical HPC Networks. IEEE access 9:43095-43106, 2021. BibTeX

@comment{Journal article in IEEE Access, volume 9. The page range uses the BibTeX double hyphen and HPC is braced so sentence-casing styles keep the acronym.}
@article{DuroJose2021SSAN,
abstract = "Photonics are becoming realistic technologies for implementing interconnection networks in near future Exascale supercomputer systems. Photonics present key features to design high-performance and scalable supercomputer networks, such as higher bandwidth and lower latencies than their electronic supercomputer networks counterparts. Some research work is focused on conventional network topologies built with photonic technologies, with the aim of taking advantage of photonic characteristics. Nevertheless, these approaches fail in that they keep low the network utilization. We looked into this downside and we found that circuit switching was the main performance limitation. In this article we propose a new switching mechanism, called Segment Switching, to address this constraint and improve the network utilization. Segment Switching splits the circuit in segments of the whole path, and uses buffering on selected nodes on the network. Experimental results show that the devised approach significantly outperforms photonic circuit switching in conventional torus and fat tree networks by 70% and 90%, respectively.",
author = "Duro, Jose and Petit, Salvador and Gomez, Maria E. and Sahuquillo, Julio",
issn = "2169-3536",
journal = "IEEE Access",
keywords = "Bandwidth ; exascale supercomputers ; Integrated circuit interconnections ; Interconnection networks ; Optical buffering ; Optical switches ; photonic technology ; Photonics ; simulation ; Switching circuits ; Wavelength division multiplexing",
language = "eng",
pages = "43095--43106",
publisher = "IEEE",
title = "Segment Switching: A New Switching Strategy for Optical {HPC} Networks",
volume = 9,
year = 2021
}

23. Marta Navarro, Lucia Pons and Julio Sahuquillo. Hy-Sched: A Simple Hyperthreading-Aware Thread to Core Allocation Strategy. IEEE computer architecture letters 20(1):26-29, 2021. BibTeX

@comment{The title field was missing (it is required for article entries); it is taken from the human-readable citation line that precedes this entry.}
@article{NavarroMarta2021HASH,
abstract = "Simultaneous multithreading processors are dominating the High Computing Performance market. Among these processors, those supporting only two threads are being the most widely deployed in current systems, thus, only two threads compete at run-time for intra-core resources. The performance of these processors can be boosted by selecting symbiotic applications to be executed on the same core, which reduces the inter-application interference considerably. In this letter we propose Hy-Sched, an scheduling algorithm that exploits symbiosis to make pairs of applications to be launched on the same physical core. The proposed approach lies on the categories of the Top-Down Method for Performance Analysis. Different variants of the algorithm are explored. Experimental results show that Hy-Sched outperforms Linux on average by 15 percent in the studied workloads.",
author = "Navarro, Marta and Pons, Lucia and Sahuquillo, Julio",
issn = "1556-6056",
journal = "IEEE Computer Architecture Letters",
keywords = "Benchmark testing ; Hardware ; Instruction sets ; Interference ; intra-core interference ; Linux ; Mathematical model ; Simultaneous multithreading ; Symbiosis ; symbiotic applications",
language = "eng",
number = 1,
pages = "26--29",
publisher = "IEEE",
title = "{Hy-Sched}: A Simple Hyperthreading-Aware Thread to Core Allocation Strategy",
volume = 20,
year = 2021
}

24. Josue Feliu, Ajeya Naithani, Julio Sahuquillo, Salvador Petit, Moinuddin K Qureshi and Lieven Eeckhout. VMT: Virtualized Multi-Threading for Accelerating Graph Workloads on Commodity Processors. IEEE transactions on computers, pages 1-1, 2021. BibTeX

@comment{No volume or issue is recorded. NOTE(review): pages 1--1 looks like an early-access placeholder rather than a real page range; confirm against the published version.}
@article{FeliuJosue2021VVMf,
author = "Feliu, Josue and Naithani, Ajeya and Sahuquillo, Julio and Petit, Salvador and Qureshi, Moinuddin K. and Eeckhout, Lieven",
issn = "0018-9340",
journal = "IEEE Transactions on Computers",
keywords = "Architecture State ; Computer architecture ; Graph Workloads ; Hardware ; Instruction sets ; Message systems ; Multi-Threading ; Registers ; Software ; Switches ; Virtualization",
language = "eng",
pages = "1--1",
publisher = "IEEE",
title = "{VMT}: Virtualized Multi-Threading for Accelerating Graph Workloads on Commodity Processors",
year = 2021
}

25. José Duro Gómez. Photonic Interconnection Networks for Exascale Computers. 2021. BibTeX

@comment{Work by Jose Duro Gomez issued by the Universitat Politecnica de Valencia in 2021; keywords are bilingual (English and Spanish).}
@misc{DuroGómezJosé2021PINf,
  author    = {Duro Gómez, José},
  title     = {Photonic Interconnection Networks for Exascale Computers},
  year      = {2021},
  publisher = {Universitat Politècnica de València},
  language  = {eng},
  keywords  = {ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Computacion a exaescala ; Exascale Supercomputers ; Interconnection networks ; Marcos de simulación ; Optical networks ; Photonic networks ; Photonic Technology ; Redes de interconexión ; Redes fotónicas ; Redes ópticas ; Simulation frameworks ; Supercomputadores a exaescala ; Tecnología fotónica},
}

26. Dezhen Wu. Estudio de prestaciones de cargas de latencia crítica en sistemas SMT. 2021. BibTeX

@comment{Typed as a master's thesis so that styles print the institution (publisher is ignored on misc entries). Degree level is inferred from the "Master Universitario en Ingenieria de Computadores y Redes" keyword; TODO confirm.}
@mastersthesis{WuDezhen2021Edpd,
author = "Wu, Dezhen",
keywords = "ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Cloud computing ; Computación en la nube ; Latencia crítica ; Latency-critical ; Máster Universitario en Ingeniería de Computadores y Redes-Màster Universitari en Enginyeria de Computadors i Xarxes ; Perf ; SMT ; TailBench",
language = "spa",
publisher = "Universitat Politècnica de València",
school = "Universitat Politècnica de València",
title = "Estudio de prestaciones de cargas de latencia crítica en sistemas {SMT}",
year = 2021
}

27. Miguel Antonio Avargues Gutiérrez. Análisis de requerimientos y diseño de un controlador de memoria principal no volátil. 2021. BibTeX

@comment{Typed as a thesis so that styles print the institution (publisher is ignored on misc entries). Degree level is inferred from the "Grado en Ingenieria Informatica" keyword; TODO confirm.}
@mastersthesis{AvarguesGutiérrezMiguelAntonio2021Adry,
author = "Avargues Gutiérrez, Miguel Antonio",
keywords = "ARQUITECTURA Y TECNOLOGIA DE COMPUTADORES ; Controlador de memoria ; Gem5 ; Grado en Ingeniería Informática-Grau en Enginyeria Informàtica ; Main memory ; Memoria no volátil ; Memoria principal ; Memory controller ; Non-volatile memory ; NVMain ; NVRAM",
language = "spa",
publisher = "Universitat Politècnica de València",
school = "Universitat Politècnica de València",
title = "Análisis de requerimientos y diseño de un controlador de memoria principal no volátil",
type = "Bachelor's thesis",
year = 2021
}

28. Carlos Navarro, Josué Feliu, Salvador Petit, Maria E Gomez and Julio Sahuquillo. Bandwidth-Aware Dynamic Prefetch Configuration for IBM POWER8. IEEE Transactions on Parallel and Distributed Systems PP (99), 2020. BibTeX

@comment{The journal field previously ended in "PP", which is a volume placeholder and not part of the journal name; it now lives in volume. NOTE(review): "PP"/99 look like early-access placeholders; replace with the final volume, issue and pages once confirmed. The doi is taken from the citation key.}
@article{ 10.1109/tpds.2020.2982392,
author = "Navarro, Carlos and Feliu, Josu{\'e} and Petit, Salvador and Gomez, Maria E. and Sahuquillo, Julio",
abstract = "Advanced hardware prefetch engines are being integrated in current high-performance processors. Prefetching can boost the performance of most applications, however, the induced bandwidth consumption can lead the system to a high contention for main memory bandwidth, which is a scarce resource in current multicores. In such a case, the system performance can be severely damaged. This work characterizes the applications’ behavior in an IBM POWER8 machine, which presents many prefetch settings, varying the bandwidth contention degree. The study reveals that the best prefetch setting for each application depends on the main memory bandwidth availability, that is, it depends on the co-running applications. Based on this study, we propose Bandwidth-Aware Prefetch Configuration (BAPC) a scalable adaptive prefetching algorithm that improves the performance of multi-program workloads. BAPC increases the performance of the applications in a 8%, 11%, and 12% for workload mixes composed of 6, 8, and 10 applications over the IBM POWER8 default configuration. In addition to performance, BAPC reduces bandwidth consumption in 39%, 42%, and 45%, respectively.",
doi = "10.1109/tpds.2020.2982392",
journal = "IEEE Transactions on Parallel and Distributed Systems",
number = 99,
title = "{B}andwidth-{A}ware {D}ynamic {P}refetch {C}onfiguration for {IBM} {POWER8}",
volume = "PP",
year = 2020
}

29. Tomas Picornell, Carles Hernández, Jose Flich and Jose Duato. Enforcing Predictability of Many-cores with DCFNoC. IEEE Transactions on Computers, 2020. BibTeX

@comment{Early-access record (no volume or pages). The 2021 entry with the same title earlier in this file cites the final version as IEEE Transactions on Computers 70(2):270-283 with this same DOI. The doi is taken from the citation key.}
@article{ 10.1109/tc.2020.2987797,
author = "Picornell, Tomas and Hern{\'a}ndez, Carles and Flich, Jose and Duato, Jose",
abstract = "The ever need for higher performance forces industry to include technology based on multi-processors system on chip (MPSoCs) in their safety-critical embedded systems. MPSoCs include a network-on-chip (NoC) to interconnect the cores between them and with memory and the rest of shared resources. Unfortunately, the inclusion of NoCs compromises guaranteeing time predictability as network-level conflicts may occur. To overcome this problem, in this paper we propose DCFNoC, a new time-predictable NoC design paradigm where conflicts within the network are eliminated by design. This new paradigm builds on top of the Channel Dependency Graph (CDG) in order to deterministically avoid network conflicts. The network guarantees predictability to applications and is able to naturally inject messages using a TDM period equal to the optimal theoretical bound without the need of using a computationally demanding offline process. DCFNoC is integrated in a tile-based many-core system and adapted to its memory hierarchy. Our results show that DCFNoC guarantees time predictability avoiding network interference among multiple running applications. DCFNoC always guarantees performance and also improves wormhole performance in a 4 × 4 setting by a factor of 3.7× when interference traffic is injected. For a 8 × 8 network differences are even larger. In addition, DCFNoC obtains a total area saving of 10.79% over a standard wormhole implementation.",
doi = "10.1109/tc.2020.2987797",
journal = "IEEE Transactions on Computers",
title = "{E}nforcing {P}redictability of {M}any-cores with {DCFNoC}",
year = 2020
}

30. Miguel Gorgues and Jose Flich. A Low-Latency and Flexible TDM NoC for Strong Isolation in Security-Critical Systems. 2019 IEEE 13th International Symposium on Embedded Multicore/Many-core Systems-on-Chip (MCSoC), 2019. BibTeX

@comment{Conference paper: the symposium name belongs in booktitle of an inproceedings entry, not in journal. The doi is taken from the citation key.}
@inproceedings{ 10.1109/mcsoc.2019.00029,
author = "Gorgues, Miguel and Flich, Jose",
booktitle = "2019 IEEE 13th International Symposium on Embedded Multicore/Many-core Systems-on-Chip (MCSoC)",
doi = "10.1109/mcsoc.2019.00029",
title = "{A} {L}ow-{L}atency and {F}lexible {TDM} {NoC} for {S}trong {I}solation in {S}ecurity-{C}ritical {S}ystems",
year = 2019
}

31. Francisco Candel, Alejandro Valero, Salvador Petit and Julio Sahuquillo. An Aging-Aware GPU Register File Design Based on Data Redundancy. IEEE Transactions on Computers 68(1):4-20, 2019. BibTeX

@comment{volume and number were swapped: 68 is the journal volume for 2019 (this file cites volume 70 for 2021) and 1 is the issue. The doi is taken from the citation key.}
@article{ 10.1109/tc.2018.2849376,
author = "Candel, Francisco and Valero, Alejandro and Petit, Salvador and Sahuquillo, Julio",
abstract = "Nowadays, GPUs sit at the forefront of high-performance computing thanks to their massive computational capabilities. Internally, thousands of functional units, architected to be fed by large register files, fuel such a performance. At deep nanometer technologies, the SRAM memory cells that implement GPU register files are very sensitive to the Negative Bias Temperature Instability (NBTI) effect. NBTI ages cell transistors by degrading their threshold voltage $V_{th}$ over the lifetime of the GPU. This degradation, which manifests when a cell keeps the same logic value for a relatively long period of time, compromises the cell read stability and increases the transistor switching delay, which can lead to wrong read values and eventually exceed the processor cycle time, respectively, so resulting in faulty operation. This work proposes architectural mechanisms leveraging the redundancy of the data stored in GPU register files to attack NBTI aging. The proposed mechanisms are based on data compression, power gating, and register address rotation techniques. All these mechanisms working together balance the distribution of logic values stored in the cells along the execution time, reducing both the overall $V_{th}$ degradation and the increase in the transistor switching delays. Experimental results show that a conventional GPU register file suffers the worst case for NBTI, since a significant fraction of the cells maintain the same logic value during the entire application execution (i.e., a 100 percent ‘0’ and ‘1’ duty cycle distributions). On average, the proposal reduces these distributions by 58 and 68 percent, respectively, which translates into $V_{th}$ degradation savings by 54 and 62 percent, respectively.",
doi = "10.1109/tc.2018.2849376",
journal = "IEEE Transactions on Computers",
number = 1,
pages = "4--20",
title = "{A}n {A}ging-{A}ware {GPU} {R}egister {F}ile {D}esign {B}ased on {D}ata {R}edundancy",
volume = 68,
year = 2019
}

32. Jose Puche, Salvador Petit, Maria E Gomez and Julio Sahuquillo. An efficient cache flat storage organization for multithreaded workloads for low power processors. Future Generation Computer Systems, 2019. BibTeX

@comment{No volume or pages are recorded for this article. The doi is taken from the citation key.}
@article{ 10.1016/j.future.2019.11.024,
author = "Puche, Jose and Petit, Salvador and Gomez, Maria E. and Sahuquillo, Julio",
doi = "10.1016/j.future.2019.11.024",
journal = "Future Generation Computer Systems",
title = "{A}n efficient cache flat storage organization for multithreaded workloads for low power processors",
year = 2019
}

33. Tomas Picornell, Carles Hernández, Jose Duato and Jose Flich. DCFNoC: A Delayed Conflict-Free Time Division Multiplexing Network on Chip. 56th Annual Design Automation Conference 2019, 2019. BibTeX

@comment{Conference paper: the conference name belongs in booktitle of an inproceedings entry, not in journal. The doi is taken from the citation key.}
@inproceedings{ 10.1145/3316781.3317794,
author = "Picornell, Tomas and Hern{\'a}ndez, Carles and Duato, Jose and Flich, Jose",
abstract = "The adoption of many-cores in safety-critical systems requires real-time capable networks on chip (NoC). In this paper we propose a new time-predictable NoC design paradigm where contention within the network is eliminated. This new paradigm builds on the Channel Dependency Graph (CDG) and guarantees by design the absence of contention. Our delayed conflict-free NoC (DCFNoC) is able to naturally inject messages using a TDM period equal to the optimal theoretical bound and without the need of using a computationally demanding offline process. Results show that DCFNoC guarantees time predictability with very low implementation cost.",
booktitle = "56th Annual Design Automation Conference 2019",
doi = "10.1145/3316781.3317794",
title = "{DCFNoC}: {A} {D}elayed {C}onflict-{F}ree {T}ime {D}ivision {M}ultiplexing {N}etwork on {C}hip",
year = 2019
}

34. Francisco Candel, Alejandro Valero, Salvador Petit and Julio Sahuquillo. Efficient Management of Cache Accesses to Boost GPGPU Memory Subsystem Performance. IEEE Transactions on Computers 68(10):1442-1454, 2019. BibTeX

@comment{volume and number were swapped: 68 is the journal volume for 2019 (this file cites volume 70 for 2021) and 10 is the issue. The doi is taken from the citation key.}
@article{ 10.1109/tc.2019.2907591,
author = "Candel, Francisco and Valero, Alejandro and Petit, Salvador and Sahuquillo, Julio",
abstract = "To support the massive amount of memory accesses that GPGPU applications generate, GPU memory hierarchies are becoming more and more complex, and the Last Level Cache (LLC) size considerably increases each GPU generation. This paper shows that counter-intuitively, enlarging the LLC brings marginal performance gains in most applications. In other words, increasing the LLC size does not scale neither in performance nor energy consumption. We examine how LLC misses are managed in typical GPUs, and we find that in most cases the way LLC misses are managed are precisely the main performance limiter. This paper proposes a novel approach that addresses this shortcoming by leveraging a tiny additional Fetch and Replacement Cache-like structure (FRC) that stores control and coherence information of the incoming blocks until they are fetched from main memory. Then, the fetched blocks are swapped with the victim blocks (i.e., selected to be replaced) in the LLC, and the eviction of such victim blocks is performed from the FRC. This approach improves performance due to three main reasons: i) the lifetime of blocks being replaced is enlarged, ii) the main memory path is unclogged on long bursts of LLC misses, and iii) the average LLC miss latency is reduced. The proposal improves the LLC hit ratio, memory-level parallelism, and reduces the miss latency compared to much larger conventional caches. Moreover, this is achieved with reduced energy consumption and with much less area requirements. Experimental results show that the proposed FRC cache scales in performance with the number of GPU compute units and the LLC size, since, depending on the FRC size, performance improves ranging from 30% to 67% for a modern baseline GPU card, and from 32% to 118% for a larger GPU. In addition, energy consumption is reduced on average from 49% to 57% for the larger GPU. These benefits come with a small area increase (by 7.3%) over the LLC baseline.",
doi = "10.1109/tc.2019.2907591",
journal = "IEEE Transactions on Computers",
number = 10,
pages = "1442--1454",
title = "{E}fficient {M}anagement of {C}ache {A}ccesses to {B}oost {GPGPU} {M}emory {S}ubsystem {P}erformance",
volume = 68,
year = 2019
}

35. Jose Puche, Salvador Petit, Maria E Gomez and Julio Sahuquillo. FOS: a low-power cache organization for multicores. The Journal of Supercomputing 3s(75):1-32, 2019. BibTeX

@comment{volume and number were swapped, as in the other entries from this export: 75 is taken to be the journal volume. NOTE(review): the issue value "3s" and pages 1--32 look like export artefacts; confirm against the publisher record. The doi is taken from the citation key.}
@article{ 10.1007/s11227-019-02858-x,
author = "Puche, Jose and Petit, Salvador and Gomez, Maria E. and Sahuquillo, Julio",
abstract = "The cache hierarchy of current multicore processors typically consists of one or two levels of private caches per core and a large shared last-level cache. This approach incurs area and energy wasting due to oversizing the private cache space, data replication through the inclusive cache levels, as well as the use of highly set-associative caches. In this paper, we claim that although this is the commonly adopted approach, it presents important design issues that can be addressed by a more energy efficient organization. This work proposes Flat On-chip Storage (FOS), a novel cache organization that, aimed at addressing energy and area on low-power processors, resolves the mentioned issues. For this purpose, FOS combines L2 and L3 cache levels into a single one, organized as a flat space, and composed of a pool of private small cache slices. These slices are initially powered off to save energy, and they are powered on and assigned to cores provided that the system performance is expected to improve. To provide fast and uniform access from the private L1 caches to the FOS’s cache slices, multiple architectural challenges are overcome, which entails the design of a custom optical network-on-chip. Experimental results show that FOS achieves significant energy savings on both static and dynamic energy over conventional cache organizations with the same storage capacity. FOS static energy savings are as much as 60% over an electrically connected shared cache; these savings grow up to 75% compared to optically connected baselines. Moreover, despite deactivating part of the cache space, FOS achieves similar performance values as those achieved by conventional approaches.",
doi = "10.1007/s11227-019-02858-x",
journal = "The Journal of Supercomputing",
number = "3s",
pages = "1--32",
title = "{FOS}: a low-power cache organization for multicores",
volume = 75,
year = 2019
}

36. Josué Feliu, Salvador Petit and Julio Sahuquillo. Thread Isolation to Improve Symbiotic Scheduling on SMT Multicore Processors. IEEE Transactions on Parallel and Distributed Systems PP (99), 2019. BibTeX

@comment{The journal field previously ended in "PP", which is a volume placeholder and not part of the journal name; it now lives in volume. NOTE(review): "PP"/99 look like early-access placeholders; replace with the final volume, issue and pages once confirmed. The doi is taken from the citation key.}
@article{ 10.1109/tpds.2019.2934955,
author = "Feliu, Josu{\'e} and Petit, Salvador and Sahuquillo, Julio",
doi = "10.1109/tpds.2019.2934955",
journal = "IEEE Transactions on Parallel and Distributed Systems",
number = 99,
title = "{T}hread {I}solation to {I}mprove {S}ymbiotic {S}cheduling on {SMT} {M}ulticore {P}rocessors",
volume = "PP",
year = 2019
}

37. Clara Furió, Josué Feliu, Julio Sahuquillo, Salvador Petit and Jose Duro. A Workload Generator for Evaluating SMT Real-Time Systems. 2018 International Conference on High Performance Computing & Simulation (HPCS), 2018. BibTeX

@comment{Conference paper: the conference name belongs in booktitle of an inproceedings entry, not in journal. The doi is taken from the citation key.}
@inproceedings{ 10.1109/hpcs.2018.00067,
author = "Furi{\'o}, Clara and Feliu, Josu{\'e} and Sahuquillo, Julio and Petit, Salvador and Duro, Jose",
booktitle = "2018 International Conference on High Performance Computing {\&} Simulation (HPCS)",
doi = "10.1109/hpcs.2018.00067",
title = "{A} {W}orkload {G}enerator for {E}valuating {SMT} {R}eal-{T}ime {S}ystems",
year = 2018
}

38. Vicent Selfa, Julio Sahuquillo, Maria E Gomez and Crispín Gomez. Efficient selective multicore prefetching under limited memory bandwidth. Journal of Parallel and Distributed Computing (120), 2018. BibTeX

@comment{120 is recorded as the journal volume rather than an issue number. NOTE(review): confirm against the publisher record and add pages. The doi is taken from the citation key.}
@article{ 10.1016/j.jpdc.2018.05.002,
author = "Selfa, Vicent and Sahuquillo, Julio and Gomez, Maria E. and Gomez, Crisp{\'i}n",
abstract = "Current multicore systems implement multiple hardware prefetchers to tolerate long main memory latencies. However, memory bandwidth is a scarce shared resource which becomes critical with the increasing core count. To deal with this fact, recent works have focused on adaptive prefetchers, which control the prefetcher aggressiveness to regulate the main memory bandwidth consumption. Nevertheless, in limited bandwidth machines or under memory-hungry workloads, keeping active the prefetcher can damage the system performance and increase energy consumption. This paper introduces selective prefetching, where individual prefetchers are activated or deactivated to improve both main memory energy and performance, and proposes ADP, a prefetcher that deactivates local prefetchers in some cores when they present low performance and co-runners need additional bandwidth. Based on heuristics, an individual prefetcher is reactivated when performance enhancements are foreseen. Compared to a state-of-the-art adaptive prefetcher, ADP provides both performance and energy enhancements in limited memory bandwidth.",
doi = "10.1016/j.jpdc.2018.05.002",
journal = "Journal of Parallel and Distributed Computing",
title = "{E}fficient selective multicore prefetching under limited memory bandwidth",
volume = 120,
year = 2018
}

39. Jose Flich. Exploring Manycore Architectures for Next-Generation HPC Systems through the MANGO Approach. Microprocessors and Microsystems, 2018. BibTeX

@comment{The title was truncated ("Approac"); its last word is restored. The doi is taken from the citation key.}
@article{ 10.1016/j.micpro.2018.05.011,
author = "Flich, Jose",
abstract = "The Horizon 2020 MANGO project aims at exploring deeply heterogeneous accelerators for use in High-Performance Computing systems running multiple applications with different Quality of Service (QoS) levels. The main goal of the project is to exploit customization to adapt computing resources to reach the desired QoS. For this purpose, it explores different but interrelated mechanisms across the architecture and system software. In particular, in this paper we focus on the runtime resource management, the thermal management, and support provided for parallel programming, as well as introducing three applications on which the project foreground will be validated.",
doi = "10.1016/j.micpro.2018.05.011",
journal = "Microprocessors and Microsystems",
title = "{E}xploring {M}anycore {A}rchitectures for {N}ext-{G}eneration {HPC} {S}ystems through the {MANGO} {A}pproach",
year = 2018
}

40. Francisco Candel, Salvador Petit, Alejandro Valero and Julio Sahuquillo. Improving GPU Cache Hierarchy Performance with a Fetch and Replacement Cache. The 24th International European Conference on Parallel and Distributed Computing, 2018. BibTeX

@comment{NOTE(review): same title, author set and year as entry 10.1007/978-3-319-96983-1_17; presumably a second record of the same paper -- confirm and consolidate.}
@inproceedings{gpu,
  author    = {Candel, Francisco and Petit, Salvador and Valero, Alejandro and Sahuquillo, Julio},
  title     = {Improving {GPU} Cache Hierarchy Performance with a Fetch and Replacement Cache},
  booktitle = {The 24th International European Conference on Parallel and Distributed Computing},
  year      = {2018},
  abstract  = {In the last few years, GPGPU computing has become one of the most popular computing paradigms in high-performance computers due to its excellent performance to power ratio. The memory requirements of GPGPU applications widely differ from the requirements of CPU counterparts. The amount of memory accesses is several orders of magnitude higher in GPU applications than in CPU applications, and they present disparate access patterns. Because of this fact, large and highly associative Last-Level Caches (LLCs) bring much lower performance gains in GPUs than in CPUs. This paper presents a novel approach to manage LLC misses that efficiently improves LLC hit ratio, memory-level parallelism, and miss latencies in GPU systems. The proposed approach leverages a small additional Fetch and Replacement Cache (FRC) that stores control and coherence information of incoming blocks until they are fetched from main memory. Then, fetched blocks are swapped with victim blocks to be replaced in the LLC. After that, the eviction of victim blocks is performed from the FRC. This management approach improves performance due to three main reasons: i) the lifetime of blocks being replaced is increased, ii) the main memory path is unclogged on long bursts of LLC misses, and iii) the average L2 miss delaying latency is reduced. Experimental results show that our proposal increases the performance (OPC) over 25\% in most of the studied applications, reaching improvements up to 400\% in some applications.},
}

41. Francisco Candel, Julio Sahuquillo, Salvador Petit and Alejandro Valero. Improving GPU Cache Hierarchy Performance with a Fetch and Replacement Cache: 24th International Conference on Parallel and Distributed Computing. 24th International Conference on Parallel and Distributed Computing, 2018. BibTeX

@comment{NOTE(review): the proceedings name that trailed the title after a colon is kept only in booktitle; it looks like an export artifact -- confirm the title against the publisher record.}
@inproceedings{10.1007/978-3-319-96983-1_17,
  author    = {Candel, Francisco and Sahuquillo, Julio and Petit, Salvador and Valero, Alejandro},
  title     = {Improving {GPU} Cache Hierarchy Performance with a Fetch and Replacement Cache},
  booktitle = {24th International Conference on Parallel and Distributed Computing},
  year      = {2018},
  doi       = {10.1007/978-3-319-96983-1_17},
}

42. Lucía Pons, Vicent Selfa, Salvador Petit and Julio Sahuquillo. Improving System Turnaround Time with Intel CAT by Identifying LLC Critical Applications. Euro-Par 2018: Parallel Processing, pages 603-615, 2018. BibTeX

@inproceedings{10.1007/978-3-319-96983-1_43,
  author    = {Pons, Luc{\'i}a and Selfa, Vicent and Petit, Salvador and Sahuquillo, Julio},
  title     = {Improving System Turnaround Time with {Intel} {CAT} by Identifying {LLC} Critical Applications},
  booktitle = {{Euro-Par} 2018: Parallel Processing},
  pages     = {603--615},
  year      = {2018},
  doi       = {10.1007/978-3-319-96983-1_43},
}

43. Jose Duro, Salvador Petit and Julio Sahuquillo. Modeling and analysis of the performance of exascale photonic networks. Concurrency and Computation Practice and Experience (31), 2018. BibTeX

@comment{NOTE(review): 31 is stored as the volume (it was previously in the number field); this journal's issue numbers presumably never reach 31 within one volume -- confirm volume and issue against the publisher record.}
@article{10.1002/cpe.4773,
  author   = {Duro, Jose and Petit, Salvador and Sahuquillo, Julio},
  title    = {Modeling and analysis of the performance of exascale photonic networks},
  journal  = {Concurrency and Computation: Practice and Experience},
  volume   = {31},
  year     = {2018},
  doi      = {10.1002/cpe.4773},
  abstract = {Photonics technology has become a promising and viable alternative for both on-chip and off-chip interconnection networks of future Exascale systems. Nevertheless, this technology is not mature enough yet in this context, so research efforts focusing on photonic networks are still required to achieve realistic suitable network implementations. In this regard, system-level photonic network simulators can help guide designers to assess the multiple design choices. Most current research is done on electrical network simulators, whose components work widely different from photonics components. In this work, we summarize and compare the working behavior of both technologies which includes the use of optical routers, wavelength-division multiplexing and circuit switching among others. After implementing them into a well-known simulation framework, an extensive simulation study has been carried out using realistic photonic network configurations with synthetic and realistic traffic. Experimental results show that, compared to electrical networks, optical networks can reduce the execution time of the studied real workloads in almost one order of magnitude. Our study also reveals that the photonic configuration highly impacts on the network performance, being the bandwidth per channel and the message length the most important parameters.},
}

44. Jose Duro, Salvador Petit, Julio Sahuquillo and Maria E Gomez. Workload Characterization for Exascale Computing Networks. 2018 International Conference on High Performance Computing & Simulation (HPCS), 2018. BibTeX

@inproceedings{10.1109/hpcs.2018.00069,
  author    = {Duro, Jose and Petit, Salvador and Sahuquillo, Julio and Gomez, Maria E.},
  title     = {Workload Characterization for Exascale Computing Networks},
  booktitle = {2018 International Conference on High Performance Computing \& Simulation ({HPCS})},
  year      = {2018},
  doi       = {10.1109/hpcs.2018.00069},
}