longhorn: [BUG] test_autosalvage_with_data_locality_enabled failed in v1.3.1-rc2

Describe the bug

test_autosalvage_with_data_locality_enabled failed in v1.3.1-rc2

To Reproduce

Run the tests

Expected behavior

The tests should pass

Log or Support bundle

def test_autosalvage_with_data_locality_enabled(client, core_api, make_deployment_with_pvc, volume_name, pvc): # NOQA
        """
        This e2e test follows the manual test steps at:
        https://github.com/longhorn/longhorn/issues/2778#issue-939331805
    
        Preparation:
        1. Let's call the 3 nodes: node-1, node-2, node-3
    
        Steps:
        1. Add the tag `node-1` to `node-1`
        2. Create a volume with 1 replica, data-locality set to best-effort,
           and tag set to `node-1`
        3. Create PV/PVC from the volume.
        4. Create a pod that uses the PVC. Set node selector for the pod so that
           it will be schedule on to `node-2`. This makes sure that there is a
           failed-to-scheduled local replica
        5. Wait for the pod to be in running state.
        6. Kill the instance manager r on `node-1`.
        7. In a 3-min retry loop, verify that Longhorn salvage the volume
           and the workload pod is restarted. Exec into the workload pod.
           Verify that read/write to the volume is ok
        8. Exec into the longhorn manager pod on `node-2`.
           Running `ss -a -n | grep :8500 | wc -l` to find the number of socket
           connections from this manager pod to instance manager pods.
           In a 2-min loop, verify that the number of socket connection is <= 20
    
        Cleaning up:
        1. Clean up the node tag
        """
    
        # Step1
        nodes = client.list_node()
        assert len(nodes) == 3
        node_1, node_2, node_3 = nodes
        tags = ["node-1"]
        node_1 = common.set_node_tags(client, node_1, tags)
    
        # Step2
        client.create_volume(
            name=volume_name, size=SIZE, numberOfReplicas=1,
            nodeSelector=tags, dataLocality="best-effort"
            )
    
        volume = common.wait_for_volume_detached(client, volume_name)
        assert volume.nodeSelector == tags
    
        # Step3
        pvc_name = volume_name + "-pvc"
        create_pv_for_volume(client, core_api, volume, volume_name)
        create_pvc_for_volume(client, core_api, volume, pvc_name)
    
        # Step4
        deployment_name = volume_name + "-dep"
        deployment = make_deployment_with_pvc(deployment_name, pvc_name)
        deployment["spec"]["template"]["spec"]["nodeSelector"] \
            = {"kubernetes.io/hostname": node_2.name}
    
        # Step5
        apps_api = get_apps_api_client()
        create_and_wait_deployment(apps_api, deployment)
    
        pod_names = common.get_deployment_pod_names(core_api, deployment)
        test_data = generate_random_data(VOLUME_RWTEST_SIZE)
        write_pod_volume_data(core_api, pod_names[0], test_data, filename='test')
        create_snapshot(client, volume_name)
    
        # Step6
        labels = "longhorn.io/node={}\
                , longhorn.io/instance-manager-type=replica"\
                .format(node_1["name"])
    
        ret = core_api.list_namespaced_pod(
                namespace="longhorn-system", label_selector=labels)
        imr_name = ret.items[0].metadata.name
    
        delete_and_wait_pod(core_api, pod_name=imr_name,
                            namespace='longhorn-system')
    
        # Step7
        target_pod = \
            core_api.read_namespaced_pod(name=pod_names[0], namespace='default')
        wait_delete_pod(core_api, target_pod.metadata.uid)
        pod_names = common.get_deployment_pod_names(core_api, deployment)
        wait_pod(pod_names[0])
        time.sleep(120)
        command = 'cat /data/test'
>       verified_data = exec_command_in_pod(
            core_api, command, pod_names[0], 'default')

test_ha.py:2777: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
common.py:725: in exec_command_in_pod
    return stream(
/usr/local/lib/python3.9/site-packages/kubernetes/stream/stream.py:35: in stream
    return func(*args, **kwargs)
/usr/local/lib/python3.9/site-packages/kubernetes/client/api/core_v1_api.py:841: in connect_get_namespaced_pod_exec
    (data) = self.connect_get_namespaced_pod_exec_with_http_info(name, namespace, **kwargs)  # noqa: E501
/usr/local/lib/python3.9/site-packages/kubernetes/client/api/core_v1_api.py:927: in connect_get_namespaced_pod_exec_with_http_info
    return self.api_client.call_api(
/usr/local/lib/python3.9/site-packages/kubernetes/client/api_client.py:340: in call_api
    return self.__call_api(resource_path, method,
/usr/local/lib/python3.9/site-packages/kubernetes/client/api_client.py:172: in __call_api
    response_data = self.request(
/usr/local/lib/python3.9/site-packages/kubernetes/stream/stream.py:30: in _intercept_request_call
    return ws_client.websocket_call(config, *args, **kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

configuration = <kubernetes.client.configuration.Configuration object at 0x7fdac206cfd0>
args = ('GET', 'https://10.43.0.1:443/api/v1/namespaces/default/pods/longhorn-testvol-lxkk2o-dep-5959f48c45-rmqgt/exec')
kwargs = {'_preload_content': True, '_request_timeout': None, 'body': None, 'headers': {'Accept': '*/*', 'User-Agent': 'OpenAPI...nhjhQ8M95JSAbArtNrhJkqMNNlmBMeXT77kAHQy1P8usHG276LhvlD2ojDpaWNrFsILP38gw3uceSzKt7gZiCxYDuYswTV4olFuJvPwoR1iiXUQ'}, ...}
url = 'https://10.43.0.1:443/api/v1/namespaces/default/pods/longhorn-testvol-lxkk2o-dep-5959f48c45-rmqgt/exec?command=%2Fbin%2Fsh&command=-c&command=cat+%2Fdata%2Ftest&stderr=True&stdin=False&stdout=True&tty=False'
_request_timeout = None, _preload_content = True, capture_all = True
headers = {'Accept': '*/*', 'User-Agent': 'OpenAPI-Generator/11.0.0/python', 'authorization': 'bearer eyJhbGciOiJSUzI1NiIsImtpZC...aRGrsxnhjhQ8M95JSAbArtNrhJkqMNNlmBMeXT77kAHQy1P8usHG276LhvlD2ojDpaWNrFsILP38gw3uceSzKt7gZiCxYDuYswTV4olFuJvPwoR1iiXUQ'}
query_params = [('command', '/bin/sh'), ('command', '-c'), ('command', 'cat /data/test'), ('stderr', True), ('stdin', False), ('stdout', True), ...]
key = 'tty', value = False, command = 'cat /data/test'

    def websocket_call(configuration, *args, **kwargs):
        """An internal function to be called in api-client when a websocket
        connection is required. args and kwargs are the parameters of
        apiClient.request method."""
    
        url = args[1]
        _request_timeout = kwargs.get("_request_timeout", 60)
        _preload_content = kwargs.get("_preload_content", True)
        capture_all = kwargs.get("capture_all", True)
        headers = kwargs.get("headers")
    
        # Expand command parameter list to indivitual command params
        query_params = []
        for key, value in kwargs.get("query_params", {}):
            if key == 'command' and isinstance(value, list):
                for command in value:
                    query_params.append((key, command))
            else:
                query_params.append((key, value))
    
        if query_params:
            url += '?' + urlencode(query_params)
    
        try:
            client = WSClient(configuration, get_websocket_url(url), headers, capture_all)
            if not _preload_content:
                return client
            client.run_forever(timeout=_request_timeout)
            return WSResponse('%s' % ''.join(client.read_all()))
        except (Exception, KeyboardInterrupt, SystemExit) as e:
>           raise ApiException(status=0, reason=str(e))
E           kubernetes.client.rest.ApiException: (0)
E           Reason: Handshake status 404 Not Found

Environment

v1.3.1-rc2

About this issue

  • State: open
  • Created 2 years ago
  • Comments: 20 (20 by maintainers)

Most upvoted comments

@innobead

I've confirmed that (2) AssertionError: assert 'g14pwwmlwrvt…50dyh92vef1jg' == '' is caused by data not being flushed to the disk. In the same test scenario, if I execute sync after echoing the data to the file, the issue no longer occurs. Our automated test case does almost the same thing: it generates 512 bytes of random text and echoes the generated data to the file, so it can suffer from the same flush issue.

So (1) kubernetes.client.rest.ApiException: (0) Reason: Handshake status 500 Internal Server Error and (2) AssertionError: assert 'g14pwwmlwrvt…50dyh92vef1jg' == '' are both caused by a fragile test case implementation.

After modifying the test code to (1) wait for a stably running pod and (2) use dd to write a larger amount of test data instead of echo, and rerunning the test case 30 times, the only remaining issue is (3) AssertionError: assert 'qpmr8ujdg6jt…ik5c1i8az7vb8' == 'cat: read error: I/O error\n':

https://ci.longhorn.io/job/private/job/longhorn-tests-regression/2289/ https://ci.longhorn.io/job/private/job/longhorn-tests-regression/2290/ https://ci.longhorn.io/job/private/job/longhorn-tests-regression/2291/
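
For reference, a minimal sketch of what the hardened write could look like, assuming the test's existing exec_command_in_pod helper; the dd size, target path, and checksum step are illustrative assumptions, not the actual change:

    # Sketch (assumption): write a larger chunk of test data with dd and force
    # a flush with sync, instead of echoing a short random string.
    write_cmd = "dd if=/dev/urandom of=/data/test bs=1M count=4 && sync"
    exec_command_in_pod(core_api, write_cmd, pod_names[0], 'default')

    # Record a checksum so the data can still be verified after the salvage.
    checksum = exec_command_in_pod(
        core_api, "md5sum /data/test | awk '{print $1}'", pod_names[0],
        'default')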

This test case test_autosalvage_with_data_locality_enabled can fail on all Longhorn branches (master, v1.3.x, v1.2.x), and there are 3 different kinds of failure reasons:

(1) kubernetes.client.rest.ApiException: (0) Reason: Handshake status 500 Internal Server Error:

This happens because the test code tries to execute commands in a pod that is not running. The test code already has logic to wait for the old pod to be deleted and to fetch the new pod, but the error still occurs. It seems that pod deletion/creation can happen more than once, as in the following logs: the old pod b8d22 is deleted and a new pod bbp9c is created, but somehow bbp9c never reaches the Running state and is deleted as well, and a third pod xpdcl is created and becomes Running. If the test code executes commands in the second pod bbp9c instead of the third pod xpdcl, it is bound to hit this error.

NAME                                           READY   STATUS    RESTARTS   AGE
test-deployment-57f5668b89-b8d22               1/1     Running   0          59s
test-deployment-57f5668b89-b8d22               1/1     Terminating   0          82s
test-deployment-57f5668b89-bbp9c               0/1     Pending       0          0s
test-deployment-57f5668b89-bbp9c               0/1     Pending       0          0s
test-deployment-57f5668b89-bbp9c               0/1     ContainerCreating   0          0s
test-deployment-57f5668b89-b8d22               0/1     Terminating         0          82s
test-deployment-57f5668b89-b8d22               0/1     Terminating         0          83s
test-deployment-57f5668b89-b8d22               0/1     Terminating         0          83s
test-deployment-57f5668b89-bbp9c               0/1     Terminating         0          60s
test-deployment-57f5668b89-xpdcl               0/1     Pending             0          0s
test-deployment-57f5668b89-xpdcl               0/1     Pending             0          0s
test-deployment-57f5668b89-xpdcl               0/1     ContainerCreating   0          0s
test-deployment-57f5668b89-xpdcl               1/1     Running             0          6s
test-deployment-57f5668b89-bbp9c               1/1     Terminating         0          66s
test-deployment-57f5668b89-bbp9c               0/1     Terminating         0          67s
test-deployment-57f5668b89-bbp9c               0/1     Terminating         0          67s
test-deployment-57f5668b89-bbp9c               0/1     Terminating         0          67s

This kind of error happens on all Longhorn branches (master, v1.3.x, v1.2.x) since it is caused by the problematic test code implementation.
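
As an illustration, a minimal sketch of picking the newest pod and requiring it to stay Running for a while before exec'ing into it; the helper name, the deletion-timestamp filter, and the timings are assumptions, not the actual fix:

    import time  # already imported by test_ha.py; repeated for self-containment

    def wait_for_stable_deployment_pod(core_api, label_selector,
                                       namespace='default',
                                       stable_seconds=30, timeout=300):
        # Sketch (assumption): return the newest pod of the deployment once it
        # has stayed Running for `stable_seconds`, so a pod that gets rolled
        # over again (like bbp9c above) is never used for exec.
        deadline = time.time() + timeout
        last_uid = None
        stable_since = None
        while time.time() < deadline:
            pods = [p for p in core_api.list_namespaced_pod(
                        namespace=namespace,
                        label_selector=label_selector).items
                    if p.metadata.deletion_timestamp is None]
            if pods:
                newest = max(pods, key=lambda p: p.metadata.creation_timestamp)
                if newest.status.phase == "Running":
                    if newest.metadata.uid != last_uid:
                        last_uid = newest.metadata.uid
                        stable_since = time.time()
                    elif time.time() - stable_since >= stable_seconds:
                        return newest
                else:
                    last_uid = None
            time.sleep(2)
        raise AssertionError("no stable Running pod found within timeout")

The test would then exec into the returned pod's metadata.name instead of the first entry of get_deployment_pod_names.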

(2) AssertionError: assert ‘g14pwwmlwrvt…50dyh92vef1jg’ == ‘’:

This is caused by data loss in the newly created pod. It can be reproduced by manually executing the test case. In step (10), although the file still exists, its size becomes 0 and cat on the file outputs nothing, as in the following log:

In the old pod:

kubectl exec -it test-deployment-57f5668b89-cclzh -- /bin/sh
/ # cd data
/data # ls
lost+found
/data # echo ok > ok
/data # cat ok
ok
/data # ls -al
total 28
drwxr-xr-x    3 root     root          4096 Oct 31 03:51 .
drwxr-xr-x    1 root     root          4096 Oct 31 03:51 ..
drwx------    2 root     root         16384 Oct 31 03:51 lost+found
-rw-r--r--    1 root     root             3 Oct 31 03:51 ok

In the newly created pod:

kubectl exec -it test-deployment-57f5668b89-lr4gf -- /bin/sh
/ # cd data
/data # ls
lost+found  ok
/data # cat ok
/data # ls -al
total 24
drwxr-xr-x    3 root     root          4096 Oct 31 03:51 .
drwxr-xr-x    1 root     root          4096 Oct 31 03:53 ..
drwx------    2 root     root         16384 Oct 31 03:51 lost+found
-rw-r--r--    1 root     root             0 Oct 31 03:51 ok

This kind of error also happens on all Longhorn branches (master, v1.3.x, v1.2.x) and has existed for a long time. We didn't catch it before because it is shielded by (1): since (1) happens first and fails the test case, the chance of seeing this error is reduced.

reference: master, v1.3.x, v1.2.x

(3) AssertionError: assert ‘qpmr8ujdg6jt…ik5c1i8az7vb8’ == ‘cat: read error: I/O error\n’:

Just like @chriscchien described in the comment.

This seems to be a new kind of error that only happens after 10/13: it was first observed in the master branch amd64 regression test built on 10/13 and was not observed before that.

It probably doesn't happen in v1.2.6-rc1, but it does happen in v1.2.x-head now.

Whether it happens in v1.3.x is unclear; at least it hasn't been observed there yet.

cc @innobead

Thanks @yangchiu and @chriscchien for great testing.

I can see (3) AssertionError: assert 'qpmr8ujdg6jt...ik5c1i8az7vb8' == 'cat: read error: I/O error\n' now in @chriscchien's env. It looks like the kernel remounted the filesystem as read-only. Some users reported this in the past, and now we can reproduce it. I will continue with this one after the AD controller refactoring. Output of dmesg:

[Tue Nov  1 03:31:08 2022] blk_update_request: I/O error, dev sda, sector 263176 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
[Tue Nov  1 03:31:08 2022] EXT4-fs warning (device sda): ext4_end_bio:344: I/O error 10 writing to inode 12 starting block 32898)
[Tue Nov  1 03:31:08 2022] Buffer I/O error on device sda, logical block 32897
[Tue Nov  1 03:31:08 2022] blk_update_request: I/O error, dev sda, sector 1048696 op 0x1:(WRITE) flags 0x800 phys_seg 4 prio class 0
[Tue Nov  1 03:31:08 2022] Aborting journal on device sda-8.
[Tue Nov  1 03:31:08 2022] blk_update_request: I/O error, dev sda, sector 1048576 op 0x1:(WRITE) flags 0x20800 phys_seg 1 prio class 0
[Tue Nov  1 03:31:08 2022] Buffer I/O error on dev sda, logical block 131072, lost sync page write
[Tue Nov  1 03:31:08 2022] JBD2: Error -5 detected when updating journal superblock for sda-8.
[Tue Nov  1 03:31:08 2022] sd 2:0:0:1: [sda] Synchronizing SCSI cache
[Tue Nov  1 03:32:12 2022] scsi host3: iSCSI Initiator over TCP/IP
[Tue Nov  1 03:32:12 2022] scsi 3:0:0:0: RAID              IET      Controller       0001 PQ: 0 ANSI: 5
[Tue Nov  1 03:32:12 2022] scsi 3:0:0:0: Attached scsi generic sg0 type 12
[Tue Nov  1 03:32:12 2022] scsi 3:0:0:1: Direct-Access     IET      VIRTUAL-DISK     0001 PQ: 0 ANSI: 5
[Tue Nov  1 03:32:12 2022] sd 3:0:0:1: Power-on or device reset occurred
[Tue Nov  1 03:32:12 2022] sd 3:0:0:1: Attached scsi generic sg1 type 0
[Tue Nov  1 03:32:12 2022] sd 3:0:0:1: [sdb] 2097152 512-byte logical blocks: (1.07 GB/1.00 GiB)
[Tue Nov  1 03:32:12 2022] sd 3:0:0:1: [sdb] Write Protect is off
[Tue Nov  1 03:32:12 2022] sd 3:0:0:1: [sdb] Mode Sense: 69 00 10 08
[Tue Nov  1 03:32:12 2022] sd 3:0:0:1: [sdb] Write cache: enabled, read cache: enabled, supports DPO and FUA
[Tue Nov  1 03:32:12 2022] sd 3:0:0:1: [sdb] Attached SCSI disk
[Tue Nov  1 03:32:35 2022] EXT4-fs error (device sda): ext4_journal_check_start:83: comm longhorn-manage: Detected aborted journal
[Tue Nov  1 03:32:35 2022] EXT4-fs (sda): Remounting filesystem read-only
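
For anyone reproducing this, a small hedged check to confirm the read-only remount from inside the workload pod (assuming the test's exec_command_in_pod helper and the /data mount point used by the test deployment):

    # Sketch (assumption): inspect the mount options of the Longhorn volume
    # from inside the workload pod; after the aborted journal the /data entry
    # in /proc/mounts switches from "rw,..." to "ro,...".
    mount_info = exec_command_in_pod(
        core_api, "grep ' /data ' /proc/mounts", pod_names[0], 'default')
    assert ' ro,' not in mount_info, \
        "filesystem was remounted read-only: %s" % mount_info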

Hi @PhanLe1010 ,

As we discussed before, the Input/output error can also be reproduced on Longhorn v1.2.5, v1.3.1 and v1.3.2 with k3s version v1.24.7+k3s1, thank you.

Hi @weizhe0422, if you don't mind, I will take this issue to fix the flaky test script, thank you.

I tried @chriscchien's steps and:

  1. kubernetes.client.rest.ApiException: (0) Reason: Handshake status 500 Internal Server Error: I can see this one. It happens because the Longhorn manager deleted the workload pods multiple times. We can improve this area later, but it is not critical for now. For now, we can modify the test to fetch the latest pod instead.

  2. AssertionError: assert 'g14pwwmlwrvt...50dyh92vef1jg' == '': I cannot reproduce this problem

  3. AssertionError: assert 'qpmr8ujdg6jt...ik5c1i8az7vb8' == 'cat: read error: I/O error\n': I cannot reproduce this problem

@PhanLe1010 is checking now; let @PhanLe1010 follow up instead. Thanks.

@yangchiu this is a good analysis 👍

This test case test_autosalvage_with_data_locality_enabled can fail on all Longhorn branches (master, v1.3.x, v1.2.x), and there are 3 different kinds of failure reasons:

(1) kubernetes.client.rest.ApiException: (0) Reason: Handshake status 500 Internal Server Error:

This happens because the test code tries to execute commands in a pod that is not running. The test code already has logic to wait for the old pod to be deleted and to fetch the new pod, but the error still occurs. It seems that pod deletion/creation can happen more than once, as in the following logs: the old pod b8d22 is deleted and a new pod bbp9c is created, but somehow bbp9c never reaches the Running state and is deleted as well, and a third pod xpdcl is created and becomes Running. If the test code executes commands in the second pod bbp9c instead of the third pod xpdcl, it is bound to hit this error.

This seems related to the Kubernetes scheduler, so is there a way to really ensure the deployment pod is ready before operating on it? (I feel our test cases would be more stable with this fix.)
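
One rough way to do that, as a sketch on top of the apps API the test already uses; treating generation and replica counts as the definition of "really ready" is an assumption here:

    import time

    def wait_for_deployment_settled(apps_api, name, namespace='default',
                                    timeout=300):
        # Sketch (assumption): wait until the deployment status has caught up
        # with its spec, i.e. no old replica set is still being replaced.
        deadline = time.time() + timeout
        while time.time() < deadline:
            dep = apps_api.read_namespaced_deployment(name, namespace)
            want = dep.spec.replicas or 1
            status = dep.status
            if (status.observed_generation == dep.metadata.generation and
                    (status.updated_replicas or 0) == want and
                    (status.ready_replicas or 0) == want and
                    (status.replicas or 0) == want):
                return dep
            time.sleep(2)
        raise AssertionError("deployment %s never settled" % name)

Even with this, the pod can still be rolled over again right after the check, so combining it with the newest-pod polling sketched earlier is probably still needed.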

(2) AssertionError: assert ‘g14pwwmlwrvt…50dyh92vef1jg’ == ‘’:

This is caused by data loss in the newly created pod. It can be reproduced by manually executing the test case. In step (10), although the file still exists, its size becomes 0 and cat on the file outputs nothing, as in the following log:

This kind of error also happens on all Longhorn branches (master, v1.3.x, v1.2.x) and has existed for a long time. We didn't catch it before because it is shielded by (1): since (1) happens first and fails the test case, the chance of seeing this error is reduced.

It seems like the filesystem write buffer was not flushed to the disk. Do you know whether the related function really flushes the data with sync? It's probably worth trying to update the related test function, e.g. write_pod_volume_data.
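
A hedged illustration of that idea, flushing explicitly from the test side instead of assuming anything about write_pod_volume_data's internals:

    # Sketch (assumption): keep the existing helper, then force the page cache
    # to be written back before the instance manager on node-1 is killed.
    write_pod_volume_data(core_api, pod_names[0], test_data, filename='test')
    exec_command_in_pod(core_api, 'sync', pod_names[0], 'default')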

(3) AssertionError: assert ‘qpmr8ujdg6jt…ik5c1i8az7vb8’ == ‘cat: read error: I/O error\n’:

Just like @chriscchien described in the comment.

This seems to be a new kind of error that only happens after 10/13: it was first observed in the master branch amd64 regression test built on 10/13 and was not observed before that.

It probably doesn't happen in v1.2.6-rc1, but it does happen in v1.2.x-head now.

This is back to the original issue @chriscchien found. @c3y1huang could you help check this one? @chriscchien provided the log as well and he can reproduce it manually.

Thanks @yangchiu for the analysis. Looping in @c3y1huang here.

Here is the instance-manager-r log on node-1:

  1. After manually deleting instance-manager-r on node-1
time="2022-10-19T12:34:26Z" level=info msg="Storing process logs at path: /var/log/instances"
--
Wed, Oct 19 2022 8:34:26 pm | [longhorn-instance-manager] time="2022-10-19T12:34:26Z" level=warning msg="Failed to addd TLS key pair from /tls-files/: open /tls-files/tls.crt: no such file or directory"
Wed, Oct 19 2022 8:34:26 pm | [longhorn-instance-manager] time="2022-10-19T12:34:26Z" level=info msg="Creating gRPC server with no auth"
Wed, Oct 19 2022 8:34:26 pm | [longhorn-instance-manager] time="2022-10-19T12:34:26Z" level=info msg="Instance Manager proxy gRPC server listening to 0.0.0.0:8501"
Wed, Oct 19 2022 8:34:26 pm | [longhorn-instance-manager] time="2022-10-19T12:34:26Z" level=info msg="Instance Manager process gRPC server listening to 0.0.0.0:8500"
Wed, Oct 19 2022 8:34:26 pm | [longhorn-instance-manager] time="2022-10-19T12:34:26Z" level=debug msg="started new process manager update watch"
Wed, Oct 19 2022 8:34:27 pm | [longhorn-instance-manager] time="2022-10-19T12:34:27Z" level=debug msg="Process Manager: prepare to delete process vol2-r-ab4f9873"
Wed, Oct 19 2022 8:34:27 pm | [longhorn-instance-manager] time="2022-10-19T12:34:27Z" level=debug msg="Process Manager: prepare to delete process vol2-r-ab4f9873"
Wed, Oct 19 2022 8:34:33 pm | [longhorn-instance-manager] time="2022-10-19T12:34:33Z" level=debug msg="Process Manager: prepare to delete process vol2-r-ab4f9873"
Wed, Oct 19 2022 8:35:03 pm | [longhorn-instance-manager] time="2022-10-19T12:35:03Z" level=debug msg="Process Manager: prepare to delete process vol2-r-ab4f9873"
Wed, Oct 19 2022 8:35:27 pm | [longhorn-instance-manager] time="2022-10-19T12:35:27Z" level=info msg="Process Manager: prepare to create process vol2-r-ab4f9873"
Wed, Oct 19 2022 8:35:27 pm | [longhorn-instance-manager] time="2022-10-19T12:35:27Z" level=debug msg="Process Manager: validate process path: /host/var/lib/longhorn/engine-binaries/longhornio-longhorn-engine-master-head/longhorn dir: /host/var/lib/longhorn/engine-binaries/ image: longhornio-longhorn-engine-master-head binary: longhorn"
Wed, Oct 19 2022 8:35:27 pm | [longhorn-instance-manager] time="2022-10-19T12:35:27Z" level=info msg="Process Manager: created process vol2-r-ab4f9873"
Wed, Oct 19 2022 8:35:27 pm | [vol2-r-ab4f9873] time="2022-10-19T12:35:27Z" level=info msg="Listening on gRPC Replica server 0.0.0.0:10000"
Wed, Oct 19 2022 8:35:27 pm | [vol2-r-ab4f9873] time="2022-10-19T12:35:27Z" level=info msg="Listening on data server 0.0.0.0:10001"
Wed, Oct 19 2022 8:35:27 pm | time="2022-10-19T12:35:27Z" level=info msg="Listening on sync agent server 0.0.0.0:10002"
Wed, Oct 19 2022 8:35:27 pm | [vol2-r-ab4f9873] time="2022-10-19T12:35:27Z" level=info msg="Listening on sync 0.0.0.0:10002"
Wed, Oct 19 2022 8:35:27 pm | [longhorn-instance-manager] time="2022-10-19T12:35:27Z" level=info msg="Process vol2-r-ab4f9873 has started at localhost:10000"
Wed, Oct 19 2022 8:35:28 pm | [vol2-r-ab4f9873] time="2022-10-19T12:35:28Z" level=info msg="New connection from: 10.42.3.12:47124"
Wed, Oct 19 2022 8:35:28 pm | [vol2-r-ab4f9873] time="2022-10-19T12:35:28Z" level=info msg="Opening volume /host/var/lib/longhorn/replicas/vol2-7bce68d0, size 1073741824/512"
  2. Continuing from step 1, scale the deployment down and up to let the volume reattach, and the I/O error is gone
[vol2-r-ab4f9873] time="2022-10-19T12:39:45Z" level=info msg="Closing volume"
--
Wed, Oct 19 2022 8:39:45 pm | [longhorn-instance-manager] time="2022-10-19T12:39:45Z" level=debug msg="Process Manager: prepare to delete process vol2-r-ab4f9873"
Wed, Oct 19 2022 8:39:45 pm | [longhorn-instance-manager] time="2022-10-19T12:39:45Z" level=debug msg="Process Manager: deleted process vol2-r-ab4f9873"
Wed, Oct 19 2022 8:39:45 pm | [longhorn-instance-manager] time="2022-10-19T12:39:45Z" level=debug msg="Process Manager: wait for process vol2-r-ab4f9873 to shutdown before unregistering process"
Wed, Oct 19 2022 8:39:45 pm | [longhorn-instance-manager] time="2022-10-19T12:39:45Z" level=debug msg="Process Manager: trying to stop process vol2-r-ab4f9873"
Wed, Oct 19 2022 8:39:45 pm | [longhorn-instance-manager] time="2022-10-19T12:39:45Z" level=info msg="wait for process vol2-r-ab4f9873 to shutdown"
Wed, Oct 19 2022 8:39:45 pm | [vol2-r-ab4f9873] time="2022-10-19T12:39:45Z" level=warning msg="Received signal interrupt to shutdown"
Wed, Oct 19 2022 8:39:45 pm | time="2022-10-19T12:39:45Z" level=warning msg="Starting to execute registered shutdown func github.com/longhorn/longhorn-engine/app/cmd.startReplica.func4"
Wed, Oct 19 2022 8:39:45 pm | [longhorn-instance-manager] time="2022-10-19T12:39:45Z" level=info msg="Process Manager: process vol2-r-ab4f9873 stopped"
Wed, Oct 19 2022 8:39:45 pm | [longhorn-instance-manager] time="2022-10-19T12:39:45Z" level=info msg="Process Manager: successfully unregistered process vol2-r-ab4f9873"
Wed, Oct 19 2022 8:39:45 pm | [longhorn-instance-manager] time="2022-10-19T12:39:45Z" level=debug msg="Process Manager: prepare to delete process vol2-r-ab4f9873"
Wed, Oct 19 2022 8:40:18 pm | [longhorn-instance-manager] time="2022-10-19T12:40:18Z" level=info msg="Process Manager: prepare to create process vol2-r-ab4f9873"
Wed, Oct 19 2022 8:40:18 pm | [longhorn-instance-manager] time="2022-10-19T12:40:18Z" level=debug msg="Process Manager: validate process path: /host/var/lib/longhorn/engine-binaries/longhornio-longhorn-engine-master-head/longhorn dir: /host/var/lib/longhorn/engine-binaries/ image: longhornio-longhorn-engine-master-head binary: longhorn"
Wed, Oct 19 2022 8:40:18 pm | [longhorn-instance-manager] time="2022-10-19T12:40:18Z" level=info msg="Process Manager: created process vol2-r-ab4f9873"
Wed, Oct 19 2022 8:40:18 pm | [vol2-r-ab4f9873] time="2022-10-19T12:40:18Z" level=info msg="Listening on data server 0.0.0.0:10001"
Wed, Oct 19 2022 8:40:18 pm | time="2022-10-19T12:40:18Z" level=info msg="Listening on sync agent server 0.0.0.0:10002"
Wed, Oct 19 2022 8:40:18 pm | time="2022-10-19T12:40:18Z" level=info msg="Listening on gRPC Replica server 0.0.0.0:10000"
Wed, Oct 19 2022 8:40:18 pm | [vol2-r-ab4f9873] time="2022-10-19T12:40:18Z" level=info msg="Listening on sync 0.0.0.0:10002"
Wed, Oct 19 2022 8:40:18 pm | [longhorn-instance-manager] time="2022-10-19T12:40:18Z" level=info msg="Process vol2-r-ab4f9873 has started at localhost:10000"
Wed, Oct 19 2022 8:40:19 pm | [vol2-r-ab4f9873] time="2022-10-19T12:40:19Z" level=info msg="New connection from: 10.42.3.12:59000"
Wed, Oct 19 2022 8:40:19 pm | [vol2-r-ab4f9873] time="2022-10-19T12:40:19Z" level=info msg="Opening volume /host/var/lib/longhorn/replicas/vol2-7bce68d0, size 1073741824/512"
Wed, Oct 19 2022 8:40:27 pm | [vol2-r-ab4f9873] time="2022-10-19T12:40:27Z" level=info msg="Reloading the revision counter before processing the first write, the current revision cache is 79, the latest revision counter in file is 79"

Both have the message Opening volume /host/var/lib/longhorn/replicas/vol2-7bce68d0, but step 2 has the additional message Reloading the revision counter before processing the first write, the current revision cache is 79, the latest revision counter in file is 79, and I/O works properly. instance-manager-r-5e8d090b_replica-manager (1).log

It's a flaky issue: 5 runs, 4 passed. https://ci.longhorn.io/job/private/job/longhorn-tests-regression/1831/ ~ https://ci.longhorn.io/job/private/job/longhorn-tests-regression/1835/

In the failed run, the pod becomes Running, but exec_command_in_pod raises the error.

    target_pod = \
        core_api.read_namespaced_pod(name=pod_names[0], namespace='default')
    wait_delete_pod(core_api, target_pod.metadata.uid)
    pod_names = common.get_deployment_pod_names(core_api, deployment)
    wait_pod(pod_names[0])

    command = 'cat /data/test'
    verified_data = exec_command_in_pod(
        core_api, command, pod_names[0], 'default')

Found the following error. The pod is deleted because the volume requests a remount, so exec_command_in_pod is expected to fail. We should access the newly created pod rather than the deleted one.

time="2022-08-09T06:41:46Z" level=error msg="unable to schedule replica of volume with HardNodeAffinity = rancher50-worker2" accessMode=rwo controller=longhorn-volume frontend=blockdev migratable=false node=rancher50-worker2 owner=rancher50-worker2 replica=longhorn-testvol-4w0ep9-r-6cfba3f9 state=attached volume=longhorn-testvol-4w0ep9
time="2022-08-09T06:41:46Z" level=info msg="Deleted pod longhorn-testvol-4w0ep9-dep-794996f758-6b5mk so that Kubernetes will handle remounting volume longhorn-testvol-4w0ep9" controller=longhorn-kubernetes-pod node=rancher50-worker2
time="2022-08-09T06:41:46Z" level=error msg="There's no available disk for replica longhorn-testvol-4w0ep9-r-6cfba3f9, size 16777216"
