opendal: binding/python: rust std fs is slower than python fs

Write up: https://xuanwo.io/2023/04-rust-std-fs-slower-than-python/ Reproduce: https://github.com/Xuanwo/when-i-find-rust-is-slow


:) hyperfine "python test_fs.py" "./opendal-test/target/release/opendal-test"
Benchmark 1: python test_fs.py
  Time (mean ± σ):      28.8 ms ±   1.9 ms    [User: 12.8 ms, System: 15.8 ms]
  Range (min … max):    26.2 ms …  36.3 ms    101 runs

Benchmark 2: ./opendal-test/target/release/opendal-test
  Time (mean ± σ):      28.0 ms ±   1.3 ms    [User: 0.3 ms, System: 27.7 ms]
  Range (min … max):    26.1 ms …  37.1 ms    95 runs

  Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet system without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options.

Summary
  ./opendal-test/target/release/opendal-test ran
    1.03 ± 0.08 times faster than python test_fs.py
hyperfine "python test_fs.py" "./opendal-test/target/release/opendal-test"  1.60s user 4.31s system 99% cpu 5.931 total

Python Code

import pathlib
import timeit

# Directory containing this script; the benchmark file lives next to it.
root = pathlib.Path(__file__).parent
filename = "file"

def read_file_with_normal() -> bytes:
    """Open the sibling test file in binary mode and return its full contents."""
    target = root / filename
    with target.open("rb") as handle:
        data = handle.read()
    return data

if __name__ == "__main__":
    read_file_with_normal()

Rust Code:

use std::io::Read;
use std::fs::OpenOptions;

/// Reproducer: read a 64 MiB file into a pre-allocated buffer with std::fs,
/// mirroring the Python `open(path, "rb").read()` benchmark.
fn main() {
    // NOTE: the buffer is zero-initialized up front; its (page-aligned)
    // address is part of what this issue is about, so keep the vec! form.
    let mut buffer = vec![0u8; 64 * 1024 * 1024];
    let mut file = OpenOptions::new()
        .read(true)
        .open("/tmp/test/file")
        .unwrap();
    // Fill the whole buffer; panics if the file is shorter than 64 MiB.
    file.read_exact(&mut buffer).unwrap();
    // Alternative one-liner for comparison:
    // let buffer = std::fs::read("/tmp/test/file").unwrap();
}

Why does Rust spend more time on syscalls?

Discussion:

https://discord.com/channels/1081052318650339399/1174840499576770560

About this issue

  • Original URL
  • State: closed
  • Created 7 months ago
  • Comments: 20 (12 by maintainers)

Most upvoted comments

A friend shared a link with me: https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/2030515

Also in glibc upstream: https://sourceware.org/bugzilla/show_bug.cgi?id=30994

It looks like some AMD CPUs don’t handle rep movsb (a.k.a. FSRM) correctly.

I’m able to reproduce this easily using the C reproducer above, a quick test using perf stat:

With the 0x20 offset:

perf stat -d -d -d --repeat 20 ./a.out
 Performance counter stats for './a.out' (20 runs):

             15.39 msec task-clock                       #    0.937 CPUs utilized               ( +-  3.24% )
                 1      context-switches                 #   64.972 /sec                        ( +- 17.62% )
                 0      cpu-migrations                   #    0.000 /sec
               598      page-faults                      #   38.854 K/sec                       ( +-  0.06% )
        41,239,117      cycles                           #    2.679 GHz                         ( +-  1.95% )  (40.68%)
           547,465      stalled-cycles-frontend          #    1.33% frontend cycles idle        ( +-  3.43% )  (40.60%)
           413,657      stalled-cycles-backend           #    1.00% backend cycles idle         ( +- 20.37% )  (40.50%)
        37,009,429      instructions                     #    0.90  insn per cycle
                                                  #    0.01  stalled cycles per insn     ( +-  3.13% )  (40.43%)
         5,410,381      branches                         #  351.526 M/sec                       ( +-  3.24% )  (39.80%)
            34,649      branch-misses                    #    0.64% of all branches             ( +-  4.04% )  (39.94%)
        13,965,813      L1-dcache-loads                  #  907.393 M/sec                       ( +-  3.37% )  (39.44%)
         3,623,350      L1-dcache-load-misses            #   25.94% of all L1-dcache accesses   ( +-  3.56% )  (39.52%)
   <not supported>      LLC-loads
   <not supported>      LLC-load-misses
           590,613      L1-icache-loads                  #   38.374 M/sec                       ( +-  3.39% )  (39.67%)
             1,995      L1-icache-load-misses            #    0.34% of all L1-icache accesses   ( +-  4.18% )  (39.67%)
            16,046      dTLB-loads                       #    1.043 M/sec                       ( +-  3.28% )  (39.78%)
            14,040      dTLB-load-misses                 #   87.50% of all dTLB cache accesses  ( +-  3.24% )  (39.78%)
                11      iTLB-loads                       #  714.697 /sec                        ( +- 29.56% )  (39.77%)
             3,657      iTLB-load-misses                 # 33245.45% of all iTLB cache accesses  ( +- 14.61% )  (40.30%)
           395,578      L1-dcache-prefetches             #   25.702 M/sec                       ( +-  3.34% )  (40.10%)
   <not supported>      L1-dcache-prefetch-misses

          0.016429 +- 0.000521 seconds time elapsed  ( +-  3.17% )

Without the offset:

perf stat -d -d -d --repeat 20 ./a.out
 Performance counter stats for './a.out' (20 runs):

             30.89 msec task-clock                       #    0.968 CPUs utilized               ( +-  1.35% )
                 0      context-switches                 #    0.000 /sec
                 0      cpu-migrations                   #    0.000 /sec
               598      page-faults                      #   19.362 K/sec                       ( +-  0.05% )
        90,321,344      cycles                           #    2.924 GHz                         ( +-  1.12% )  (40.76%)
           599,640      stalled-cycles-frontend          #    0.66% frontend cycles idle        ( +-  2.19% )  (42.11%)
           398,016      stalled-cycles-backend           #    0.44% backend cycles idle         ( +- 22.41% )  (41.88%)
        43,349,705      instructions                     #    0.48  insn per cycle
                                                  #    0.01  stalled cycles per insn     ( +-  1.32% )  (41.91%)
         7,526,819      branches                         #  243.701 M/sec                       ( +-  5.01% )  (41.22%)
            37,541      branch-misses                    #    0.50% of all branches             ( +-  4.62% )  (41.12%)
       127,845,213      L1-dcache-loads                  #    4.139 G/sec                       ( +-  1.14% )  (39.84%)
         3,172,628      L1-dcache-load-misses            #    2.48% of all L1-dcache accesses   ( +-  1.34% )  (38.46%)
   <not supported>      LLC-loads
   <not supported>      LLC-load-misses
           654,651      L1-icache-loads                  #   21.196 M/sec                       ( +-  1.71% )  (38.72%)
             2,828      L1-icache-load-misses            #    0.43% of all L1-icache accesses   ( +-  2.35% )  (38.67%)
            15,615      dTLB-loads                       #  505.578 K/sec                       ( +-  1.28% )  (38.82%)
            12,825      dTLB-load-misses                 #   82.13% of all dTLB cache accesses  ( +-  1.15% )  (38.88%)
                16      iTLB-loads                       #  518.043 /sec                        ( +- 27.06% )  (38.82%)
             2,202      iTLB-load-misses                 # 13762.50% of all iTLB cache accesses  ( +- 23.62% )  (39.38%)
         1,843,493      L1-dcache-prefetches             #   59.688 M/sec                       ( +-  3.36% )  (39.40%)
   <not supported>      L1-dcache-prefetch-misses

          0.031915 +- 0.000419 seconds time elapsed  ( +-  1.31% )

You can see the interesting change is the L1-dcache-prefetches and L1-dcache-loads. My CPU is AMD Ryzen 9 5900HX

Hey all, I’ve previously found this issue at work and just saw the HN thread. It occurs because the microcode doesn’t know the physical memory addresses. To then still handle overlapping virtual memory correctly (I mean when 2 virtual pages point to the same physical page, for the lack of better terms) it only compares bits 11:0 to see whether the addresses overlap. So, if the source and destination are 1 page or more apart but have a small misalignment, this then causes it to guess incorrectly and use a slower copy loop, similar to when you memcpy() something only by 1 byte around.

I’ve tested it like this (Windows specific code): https://gist.github.com/lhecker/d11deb1974c6a814576ebcff5d62e935 And these are my latest results on an 7800X3D: https://gist.github.com/lhecker/e46d895013925a5b61a4dc7ccc38dd38 (I’ve had to increase the max_size due to the large L3 cache.)

A friend shared a link with me: https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/2030515

Also in glibc upstream: https://sourceware.org/bugzilla/show_bug.cgi?id=30994

It looks like some AMD CPUs don’t handle rep movsb (a.k.a. FSRM) correctly.

It seems that rep movsb performs poorly when DATA IS PAGE ALIGNED, and performs better when DATA IS NOT PAGE ALIGNED, which is very funny…

In my case, I tried adjusting the offset value: when offset&0xFFF is between 0x10 and 0x990, it performs well; otherwise, when (dst - src)&0xFFF is small (zero included!), the performance is bad.

And it seems not related to prefetch, I disabled L1 prefetch using wrmsr, and get the following result:

With the offset

root@localhost ~/projects# perf stat -d -d -d --repeat 20 ./a.out
 Performance counter stats for './a.out' (20 runs):

             20.46 msec task-clock                       #    0.894 CPUs utilized               ( +-  2.44% )
                 0      context-switches                 #    0.000 /sec
                 0      cpu-migrations                   #    0.000 /sec
               334      page-faults                      #   16.321 K/sec                       ( +- 11.18% )
        59,037,978      cycles                           #    2.885 GHz                         ( +-  2.22% )  (32.47%)
           156,615      stalled-cycles-frontend          #    0.27% frontend cycles idle        ( +-  3.48% )  (37.40%)
        17,389,202      stalled-cycles-backend           #   29.45% backend cycles idle         ( +- 20.17% )  (42.28%)
        28,135,791      instructions                     #    0.48  insn per cycle
                                                  #    0.62  stalled cycles per insn     ( +-  2.89% )  (47.45%)
         8,236,579      branches                         #  402.490 M/sec                       ( +-  2.51% )  (52.01%)
           951,529      branch-misses                    #   11.55% of all branches             ( +-  2.12% )  (52.67%)
        16,851,255      L1-dcache-loads                  #  823.456 M/sec                       ( +-  2.24% )  (51.89%)
         2,566,477      L1-dcache-load-misses            #   15.23% of all L1-dcache accesses   ( +-  2.17% )  (46.96%)
   <not supported>      LLC-loads
   <not supported>      LLC-load-misses
        11,291,878      L1-icache-loads                  #  551.790 M/sec                       ( +-  4.63% )  (42.07%)
             7,620      L1-icache-load-misses            #    0.07% of all L1-icache accesses   ( +-  4.30% )  (36.90%)
            19,040      dTLB-loads                       #  930.411 K/sec                       ( +- 10.16% )  (31.91%)
            13,887      dTLB-load-misses                 #   72.94% of all dTLB cache accesses  ( +-  2.38% )  (31.27%)
                 2      iTLB-loads                       #   97.732 /sec                        ( +- 41.36% )  (31.29%)
               427      iTLB-load-misses                 # 21350.00% of all iTLB cache accesses  ( +-  2.84% )  (31.71%)
                 0      L1-dcache-prefetches             #    0.000 /sec                        (31.71%)
   <not supported>      L1-dcache-prefetch-misses

          0.022895 +- 0.000572 seconds time elapsed  ( +-  2.50% )

Without the offset

root@localhost ~/projects# perf stat -d -d -d --repeat 20 ./a.out
 Performance counter stats for './a.out' (20 runs):

             38.50 msec task-clock                       #    0.940 CPUs utilized               ( +-  1.45% )
                 0      context-switches                 #    0.000 /sec
                 0      cpu-migrations                   #    0.000 /sec
               381      page-faults                      #    9.897 K/sec                       ( +-  7.07% )
       114,122,791      cycles                           #    2.964 GHz                         ( +-  1.68% )  (33.74%)
           247,254      stalled-cycles-frontend          #    0.22% frontend cycles idle        ( +-  4.22% )  (36.21%)
        32,418,683      stalled-cycles-backend           #   28.41% backend cycles idle         ( +- 28.06% )  (38.64%)
        27,014,465      instructions                     #    0.24  insn per cycle
                                                  #    1.20  stalled cycles per insn     ( +-  2.03% )  (41.22%)
         7,050,880      branches                         #  183.150 M/sec                       ( +-  1.72% )  (43.17%)
           853,915      branch-misses                    #   12.11% of all branches             ( +-  1.48% )  (45.76%)
       125,288,858      L1-dcache-loads                  #    3.254 G/sec                       ( +-  1.02% )  (46.11%)
         2,861,451      L1-dcache-load-misses            #    2.28% of all L1-dcache accesses   ( +-  1.35% )  (46.22%)
   <not supported>      LLC-loads
   <not supported>      LLC-load-misses
        12,747,624      L1-icache-loads                  #  331.125 M/sec                       ( +-  2.90% )  (45.82%)
             9,829      L1-icache-load-misses            #    0.08% of all L1-icache accesses   ( +-  3.31% )  (43.25%)
            40,737      dTLB-loads                       #    1.058 M/sec                       ( +-  5.84% )  (40.87%)
            33,703      dTLB-load-misses                 #   82.73% of all dTLB cache accesses  ( +-  7.95% )  (38.29%)
                44      iTLB-loads                       #    1.143 K/sec                       ( +- 17.63% )  (35.69%)
               453      iTLB-load-misses                 # 1029.55% of all iTLB cache accesses  ( +- 13.03% )  (33.52%)
                 0      L1-dcache-prefetches             #    0.000 /sec                        (31.50%)
   <not supported>      L1-dcache-prefetch-misses

          0.040952 +- 0.000609 seconds time elapsed  ( +-  1.49% )

L1-dcache-prefetches is zero now, but L1-dcache-loads still goes much higher when there is no offset, and performance drop by a lot.

After a 24h test, I have some conclusions below.

First, this issue could not be reproduced on my machine. I tested RAM-backed files from 64M to 40G. Some of the results:


----------64M--------------------
╰─ hyperfine "python3 demo.py" target/release/demo
Benchmark 1: python3 demo.py
  Time (mean ± σ):     132.7 ms ±  25.6 ms    [User: 66.4 ms, System: 66.3 ms]
  Range (min … max):    77.3 ms … 173.2 ms    36 runs

Benchmark 2: target/release/demo
  Time (mean ± σ):      16.8 ms ±   5.9 ms    [User: 0.8 ms, System: 16.3 ms]
  Range (min … max):     8.4 ms …  40.3 ms    204 runs
------------------------------------------------

----------------1G-------------------------
╰─ hyperfine "python3 demo.py" target/release/demo
Benchmark 1: python3 demo.py
  Time (mean ± σ):     132.7 ms ±  25.6 ms    [User: 66.4 ms, System: 66.3 ms]
  Range (min … max):    77.3 ms … 173.2 ms    36 runs

Benchmark 2: target/release/demo
  Time (mean ± σ):      16.8 ms ±   5.9 ms    [User: 0.8 ms, System: 16.3 ms]
  Range (min … max):     8.4 ms …  40.3 ms    204 runs
--------------------------------------------

-----------------10G------------------------
╰─ hyperfine "python3 demo.py" target/release/demo
Benchmark 1: python3 demo.py
  Time (mean ± σ):      1.693 s ±  0.018 s    [User: 0.033 s, System: 1.657 s]
  Range (min … max):    1.670 s …  1.731 s    10 runs
 
Benchmark 2: target/release/demo
  Time (mean ± σ):      1.662 s ±  0.026 s    [User: 0.000 s, System: 1.659 s]
  Range (min … max):    1.632 s …  1.694 s    10 runs
------------------------------------------------

-------------------------30G-----------------------
Benchmark 1: python3 demo.py
  Time (mean ± σ):      5.015 s ±  0.010 s    [User: 0.031 s, System: 4.974 s]
  Range (min … max):    5.000 s …  5.030 s    10 runs
 
Benchmark 2: target/release/demo
  Time (mean ± σ):      4.990 s ±  0.027 s    [User: 0.000 s, System: 4.981 s]
  Range (min … max):    4.960 s …  5.059 s    10 runs

-----------------------------------------------------------

--------------------40G----------------------------------
Benchmark 1: python3 demo.py
  Time (mean ± σ):      7.717 s ±  0.155 s    [User: 0.031 s, System: 7.626 s]
  Range (min … max):    7.547 s …  8.014 s    10 runs
 
Benchmark 2: target/release/demo
  Time (mean ± σ):      7.605 s ±  0.097 s    [User: 0.001 s, System: 7.569 s]
  Range (min … max):    7.517 s …  7.796 s    10 runs
 -------------------------------------------------------

I used eBPF + kprobe to measure the precise read syscall time for each program

here’s result

--------------------Python----------------------
Process 31675 read file 10669758 ns
Process 31675 read file 623 ns

----------------Rust----------------------------
Process 33289 read file 10919920 ns

Here’s the script

from bcc import BPF
import time
import argparse

# BPF program: time read() syscalls on the benchmark file only.
# Flow: on openat() whose path matches the target, remember the returned fd
# per-thread; on read() of that fd, record a start timestamp; on read()
# return, emit (pid, elapsed ns) through a ring buffer to userspace.
#
# NOTE(review): matchPrefix dereferences the user-space `filename` pointer
# directly instead of going through bpf_probe_read_user() — this happens to
# work on x86-64 kprobes but is not portable; verify on other architectures.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/fs.h>
#include <linux/sched.h>

BPF_HASH(fd_info, pid_t, int);
BPF_HASH(action_info, pid_t, u64);

struct event_data_t {
    u32 pid;
    u64 delta_ts;
};


BPF_RINGBUF_OUTPUT(events, 65536);

static __always_inline int matchPrefix(const char *cs, const char *ct, int size) {
    int len = 0;
    unsigned char c1, c2;
    for (len=0;len<(size & 0xff);len++) {
        c1 = *cs++;
        c2 = *ct++;
        if (c1 != c2) return c1 < c2 ? -1 : 1;
        if (!c1) break;
     }
     return 0;
}

int trace_openat_entry(struct pt_regs *ctx, int dfd, const char __user *filename, int flags, umode_t mode) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u32 tid = bpf_get_current_pid_tgid();
    /* BUG FIX: sizeof(filename) is sizeof(char *) == 8, so only the first
     * 8 bytes of the path were compared; use the literal's size (includes
     * the NUL) so the whole target path is matched. */
    if (matchPrefix(filename, "/tmp/demofile2", sizeof("/tmp/demofile2")) != 0) {
        return 0;
    }
    fd_info.update(&tid, &dfd);

    return 0;
}

int trace_openat_return(struct pt_regs *ctx) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u32 tid = bpf_get_current_pid_tgid();
    int *dfd = fd_info.lookup(&tid);
    if (dfd == NULL) {
        return 0;
    }
    int ret = PT_REGS_RC(ctx);
    fd_info.update(&tid, &ret);
    return 0;
}

int trace_read_entry(struct pt_regs *ctx, int fd, char __user *buf, size_t count) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u32 tid = bpf_get_current_pid_tgid();
    int *dfd = fd_info.lookup(&tid);
    if (dfd == NULL) {
        return 0;
    }

    if (*dfd != fd) {
        return 0;
    }
    u64 ts = bpf_ktime_get_ns();
    action_info.update(&tid, &ts);

    return 0;
}

int trace_read_return(struct pt_regs *ctx, int ret) {
    u32 pid = bpf_get_current_pid_tgid() >> 32;
    u32 tid = bpf_get_current_pid_tgid();
    u64 *ts = action_info.lookup(&tid);
    if (ts == NULL) {
        return 0;
    }
    action_info.delete(&tid);
    struct event_data_t *event_data = events.ringbuf_reserve(sizeof(struct event_data_t));
    if (!event_data) {
        return 0;
    }
    event_data->pid = pid;
    event_data->delta_ts = bpf_ktime_get_ns() - *ts;
    /* BUG FIX: the second argument of ringbuf_submit() is a flags word,
     * not a size; sizeof(event_data) (a pointer size) is not a valid
     * flag value. Pass 0 for default notification behavior. */
    events.ringbuf_submit(event_data, 0);
    return 0;
}
"""

# Compile the BPF program and attach to the kernel entry points behind
# openat(2) and read(2). Requires root and matching kernel symbols.
bpf = BPF(text=bpf_text)

bpf.attach_kprobe(event="do_sys_openat2", fn_name="trace_openat_entry")
bpf.attach_kretprobe(event="do_sys_openat2", fn_name="trace_openat_return")
bpf.attach_kprobe(event="ksys_read", fn_name="trace_read_entry")
bpf.attach_kretprobe(event="ksys_read", fn_name="trace_read_return")

def process_event_data(cpu, data, size):
    # Print one line per timed read(): the pid and the elapsed nanoseconds.
    event = bpf["events"].event(data)
    print(f"Process {event.pid} read file {event.delta_ts} ns")


bpf["events"].open_ring_buffer(process_event_data)

# Poll the ring buffer until interrupted with Ctrl-C.
while True:
    try:
        bpf.ring_buffer_consume()
    except KeyboardInterrupt:
        exit()

Unfortunately, we cannot run this script with zen-kernel 6.6.2 from upstream (both @Xuanwo and I confirmed this).

Next step, I will try to measure the precise time by using ptrace or userspace eBPF, which does not depend on kernel symbols.