Skip to content

chore(🤖): Bump NVIDIA/Megatron-LM to bed7dbd... (2025-05-21) #14563

chore(🤖): Bump NVIDIA/Megatron-LM to bed7dbd... (2025-05-21)

chore(🤖): Bump NVIDIA/Megatron-LM to bed7dbd... (2025-05-21) #14563

Workflow file for this run

# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: CICD NeMo
on:
  schedule:
    # Nightly run at midnight UTC. Quoted so a generic YAML parser never
    # misreads the '*'-laden cron string.
    - cron: "0 0 * * *"
  pull_request:
    branches:
      - main
      - r**
      - weekly-bump*
    # Only label events trigger PR runs; pre-flight checks for 'Run CICD'.
    types: [labeled]
  workflow_dispatch:
    inputs:
      test_to_run:
        required: false
        default: all
        type: string
        description: Comma-separated list of tests to run. Use "all" to run the full test suite.
concurrency:
  # group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.event.pull_request.number || github.ref }}-${{ github.event_name }}
  # One concurrent run per PR (or ref) per label/event; newer runs cancel
  # in-flight runs of the same group.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
  cancel-in-progress: true
jobs:
pre-flight:
runs-on: ubuntu-latest
outputs:
test_to_run: ${{ steps.test_to_run.outputs.main }}
is_ci_workload: ${{ steps.is_ci_workload.outputs.main }}
no_fail_fast: ${{ steps.no_fail_fast.outputs.main }}
components_to_run: ${{ steps.components_to_run.outputs.main }}
env:
TESTS_TO_RUN: ${{ inputs.test_to_run }}
EVENT_NAME: ${{ github.event_name }}
HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
steps:
- name: Checkout branch
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Select components to run
id: components_to_run
run: |
pip install -U pip
pip install git-python
if [[ "$EVENT_NAME" == "pull_request" ]]; then
python .github/scripts/components_to_run.py --source-sha ${{ github.event.pull_request.head.sha }} --target-sha ${{ github.event.pull_request.base.sha }}
else
echo '["nemo2", "automodel", "export-deploy", "speech"]' | tee -a test_modules.json
fi
components_to_run=$(cat test_modules.json)
echo "main=${components_to_run}" | tee -a "$GITHUB_OUTPUT"
- name: Select tests to run
id: test_to_run
run: |
# For manual dispatch, we replace `all` with the actual job names
if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
TESTS_TO_RUN=$TESTS_TO_RUN
# For correctly labeled PR, we replace `all` with the actual job names
elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" == "true" ]]; then
TESTS_TO_RUN=all
# For incorrectly labeled PR, run no tests
elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" != "true" ]]; then
TESTS_TO_RUN=""
# For push events, run all tests. This is so that we can generate coverage
# on branch `main`.
elif [[ "$EVENT_NAME" == "push" || "$EVENT_NAME" == "schedule" ]]; then
TESTS_TO_RUN=all
else
echo "Unsupported event_name $EVENT_NAME provided".
exit 1
fi
parsed_string=$(echo "$TESTS_TO_RUN" | jq -c --raw-input 'split(",")')
echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"
- name: Check if this is a CI workload
shell: bash
id: is_ci_workload
run: |
branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
if [[ "$branch_name" =~ ^bump-ci-container || "$EVENT_NAME" == "schedule" ]]; then
is_ci_workload=true
echo "main=true" | tee -a "$GITHUB_OUTPUT"
else
is_ci_workload=false
fi
echo "main=$is_ci_workload" | tee -a "$GITHUB_OUTPUT"
- name: Check if no-fail-fast is set
shell: bash
id: no_fail_fast
env:
HAS_FAIL_FAST_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'no-fail-fast') }}
run: |
if [[ "$HAS_FAIL_FAST_LABEL" == "true" || "$EVENT_NAME" == "schedule" ]]; then
no_fail_fast=true
else
no_fail_fast=false
fi
echo "main=$no_fail_fast" | tee -a "$GITHUB_OUTPUT"
code-linting:
if: needs.pre-flight.outputs.test_to_run != '[]'
needs: [pre-flight]
uses: ./.github/workflows/code-linting.yml
cicd-wait-in-queue:
needs: [pre-flight, code-linting]
runs-on: ubuntu-latest
environment: test
if: |
needs.pre-flight.outputs.test_to_run != '[]'
&& needs.pre-flight.outputs.components_to_run != '[]'
&& needs.pre-flight.outputs.is_ci_workload == 'false'
steps:
- name: Running CI tests
run: |
echo "Running CI tests"
cicd-test-container-build:
uses: ./.github/workflows/_build_container.yml
needs: [pre-flight, code-linting, cicd-wait-in-queue]
if: |
needs.pre-flight.outputs.test_to_run != '[]'
&& needs.pre-flight.outputs.components_to_run != '[]'
&& (
success()
|| (
needs.cicd-wait-in-queue.result == 'skipped'
&& needs.pre-flight.outputs.is_ci_workload == 'true'
)
)
&& !cancelled()
with:
image-name: nemo_container
dockerfile: docker/Dockerfile.ci
cicd-import-tests:
if: |
needs.pre-flight.outputs.test_to_run != '[]'
&& needs.pre-flight.outputs.components_to_run != '[]'
&& (
success()
|| (
needs.cicd-wait-in-queue.result == 'skipped'
&& needs.pre-flight.outputs.is_ci_workload == 'true'
)
)
&& !cancelled()
needs: [cicd-test-container-build, pre-flight]
runs-on: self-hosted-azure-gpus-1
steps:
- name: Create UUID
id: uuid
run: |
echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"
- name: Checkout NeMo
uses: actions/checkout@v2
with:
repository: NVIDIA/NeMo
path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo
- name: Run some checks
run: |
docker run \
--rm \
--device=/dev/nvidia0 \
--gpus all \
--shm-size=8g \
--volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
--env TRANSFORMERS_OFFLINE=0 \
--env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c '\
# PyTorch Lightning version
python -c "import lightning.pytorch; print(lightning.pytorch.__version__)"
# PyTorch Lightning DDP Checks
CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"
# Basic Import Checks
python tests/core_ptl/check_imports.py --domain asr
python tests/core_ptl/check_imports.py --domain nlp
python tests/core_ptl/check_imports.py --domain tts
'
L0_Setup_Test_Data_And_Models:
needs: [pre-flight, cicd-test-container-build, cicd-wait-in-queue]
runs-on: self-hosted-azure
if: |
needs.pre-flight.outputs.test_to_run != '[]'
&& needs.pre-flight.outputs.components_to_run != '[]'
&& (
success()
|| (
needs.cicd-wait-in-queue.result == 'skipped'
&& needs.pre-flight.outputs.is_ci_workload == 'true'
)
)
&& !cancelled()
steps:
- name: Checkout
uses: actions/checkout@v4
with:
path: ${{ github.run_id }}
- name: main
uses: NVIDIA/NeMo/.github/actions/test-template@main
with:
runner: ${{ runner.name }}
script: L0_Setup_Test_Data_And_Models
tests_to_run: '["L0_Setup_Test_Data_And_Models"]'
cicd-main-unit-tests:
needs: [pre-flight, cicd-test-container-build]
uses: ./.github/workflows/cicd-main-unit-tests.yml
if: |
needs.pre-flight.outputs.test_to_run != '[]'
&& needs.pre-flight.outputs.components_to_run != '[]'
&& (
success()
|| (
needs.cicd-wait-in-queue.result == 'skipped'
&& needs.pre-flight.outputs.is_ci_workload == 'true'
)
)
&& !cancelled()
with:
test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
cicd-main-export-deploy:
needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
uses: ./.github/workflows/cicd-main-export-deploy.yml
if: |
(
needs.pre-flight.outputs.test_to_run != '[]'
&& (
contains(fromJson(needs.pre-flight.outputs.components_to_run), 'export-deploy')
)
)
&& (
success()
|| (
needs.cicd-wait-in-queue.result == 'skipped'
&& needs.pre-flight.outputs.is_ci_workload == 'true'
)
)
&& !cancelled()
with:
test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
cicd-main-speech:
needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
uses: ./.github/workflows/cicd-main-speech.yml
if: |
(
needs.pre-flight.outputs.test_to_run != '[]'
&& (
contains(fromJson(needs.pre-flight.outputs.components_to_run), 'speech')
)
)
&& (
success()
|| (
needs.cicd-wait-in-queue.result == 'skipped'
&& needs.pre-flight.outputs.is_ci_workload == 'true'
)
)
&& !cancelled()
with:
test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
cicd-main-automodel:
needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
uses: ./.github/workflows/cicd-main-automodel.yml
if: |
(
needs.pre-flight.outputs.test_to_run != '[]'
&& (
contains(fromJson(needs.pre-flight.outputs.components_to_run), 'automodel')
)
)
&& (
success()
|| (
needs.cicd-wait-in-queue.result == 'skipped'
&& needs.pre-flight.outputs.is_ci_workload == 'true'
)
)
&& !cancelled()
with:
test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
cicd-main-nemo2:
needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
uses: ./.github/workflows/cicd-main-nemo2.yml
if: |
(
needs.pre-flight.outputs.test_to_run != '[]'
&& (
contains(fromJson(needs.pre-flight.outputs.components_to_run), 'nemo2')
|| needs.pre-flight.outputs.components_to_run == '["all"]'
)
)
&& (
success()
|| (
needs.cicd-wait-in-queue.result == 'skipped'
&& needs.pre-flight.outputs.is_ci_workload == 'true'
)
)
&& !cancelled()
with:
test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
Nemo_CICD_Test:
needs:
- pre-flight
- cicd-test-container-build
- cicd-import-tests
- L0_Setup_Test_Data_And_Models
- cicd-main-unit-tests
- cicd-main-nemo2
- cicd-main-export-deploy
- cicd-main-automodel
- cicd-main-speech
if: always()
runs-on: ubuntu-latest
permissions: write-all
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Get workflow result
id: result
env:
GH_TOKEN: ${{ github.token }}
RUN_ID: ${{ github.run_id }}
HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
IS_SCHEDULED: ${{ github.event_name == 'schedule' }}
run: |
# Get workflow run details and check job conclusions
LATEST_ATTEMPT=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion != null) | .conclusion] | last')
NUM_FAILED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
NUM_CANCELLED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')
if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 && ("$HAS_LABEL" == "true" || "$IS_SCHEDULED" == "true") ]]; then
RESULT="success"
elif [[ $NUM_CANCELLED -gt 0 ]]; then
RESULT="cancelled"
else
RESULT="failure"
fi
# Output the final status
echo "code=$RESULT" | tee -a $GITHUB_OUTPUT
- name: Checkout for GH CLI
uses: actions/checkout@v4
- name: Remove label if not cancelled
if: |
steps.result.outputs.code != 'cancelled'
&& github.event.label.name == 'Run CICD'
&& github.event.pull_request.head.repo.full_name == github.repository
env:
GH_TOKEN: ${{ github.token }}
PR_NUMBER: ${{ github.event.number }}
run: gh pr edit "$PR_NUMBER" --remove-label "Run CICD"
- name: Pipeline successful, add PR comment
if: |
steps.result.outputs.code == 'success'
&& github.event_name == 'pull_request'
&& env.SLACK_WEBHOOK != ''
uses: peter-evans/create-or-update-comment@v4
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPOSITORY: ${{ github.repository }}
RUN_ID: ${{ github.run_id }}
with:
issue-number: ${{ github.event.number }}
body: |
[🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,
We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully.
So it might be time to merge this PR or get some approvals.
//cc @chtruong814 @ko3n1g @pablo-garay @thomasdhc
- name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
if: |
steps.result.outputs.code == 'failure'
&& github.event.label.name == 'Run CICD'
&& env.SLACK_WEBHOOK != ''
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPOSITORY: ${{ github.repository }}
RUN_ID: ${{ github.run_id }}
PR_NUMBER: ${{ github.event.number }}
SERVER_URL: ${{ github.server_url }}
run: |
set -x
pip install PyGithub
export BRANCH_NAME=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
python .github/scripts/notify.py
- name: Exit
if: ${{ always() }}
env:
RESULT: ${{ steps.result.outputs.code }}
run: |
if [ $RESULT == "success" ]; then
exit 0
else
exit 1
fi
Coverage:
runs-on: ubuntu-latest
needs: [pre-flight, Nemo_CICD_Test]
if: |
needs.pre-flight.outputs.test_to_run != '[]'
&& needs.pre-flight.outputs.components_to_run != '[]'
&& (
success()
|| needs.Nemo_CICD_Test.result == 'success'
)
&& !cancelled()
strategy:
matrix:
flag: [unit-test, e2e]
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Download coverage reports of current branch
uses: actions/download-artifact@v4
with:
pattern: coverage-${{ matrix.flag }}-*
- name: Get total coverage of current branch
shell: bash -x -e -u -o pipefail {0}
if: always()
run: |
pip install coverage
ls -al .
ls -al coverage-*/
coverage combine --keep $(ls coverage-*/.coverage)
coverage report -i
rm -rf coverage-*
ls -al
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
verbose: true
flags: ${{ matrix.flag }}
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: coverage-${{ matrix.flag }}-aggregated
path: |
.coverage
include-hidden-files: true