18c16567dSChristoph Hellwig // SPDX-License-Identifier: GPL-2.0
200e04393SOmar Sandoval /*
300e04393SOmar Sandoval * The Kyber I/O scheduler. Controls latency by throttling queue depths using
400e04393SOmar Sandoval * scalable techniques.
500e04393SOmar Sandoval *
600e04393SOmar Sandoval * Copyright (C) 2017 Facebook
700e04393SOmar Sandoval */
800e04393SOmar Sandoval
900e04393SOmar Sandoval #include <linux/kernel.h>
1000e04393SOmar Sandoval #include <linux/blkdev.h>
1100e04393SOmar Sandoval #include <linux/module.h>
1200e04393SOmar Sandoval #include <linux/sbitmap.h>
1300e04393SOmar Sandoval
14b357e4a6SChaitanya Kulkarni #include <trace/events/block.h>
15b357e4a6SChaitanya Kulkarni
162e9bc346SChristoph Hellwig #include "elevator.h"
1700e04393SOmar Sandoval #include "blk.h"
1800e04393SOmar Sandoval #include "blk-mq.h"
1916b738f6SOmar Sandoval #include "blk-mq-debugfs.h"
2000e04393SOmar Sandoval #include "blk-mq-sched.h"
2100e04393SOmar Sandoval
226c3b7af1SOmar Sandoval #define CREATE_TRACE_POINTS
236c3b7af1SOmar Sandoval #include <trace/events/kyber.h>
246c3b7af1SOmar Sandoval
256e25cb01SOmar Sandoval /*
266e25cb01SOmar Sandoval * Scheduling domains: the device is divided into multiple domains based on the
276e25cb01SOmar Sandoval * request type.
286e25cb01SOmar Sandoval */
2900e04393SOmar Sandoval enum {
3000e04393SOmar Sandoval KYBER_READ,
316e25cb01SOmar Sandoval KYBER_WRITE,
326e25cb01SOmar Sandoval KYBER_DISCARD,
336e25cb01SOmar Sandoval KYBER_OTHER,
3400e04393SOmar Sandoval KYBER_NUM_DOMAINS,
3500e04393SOmar Sandoval };
3600e04393SOmar Sandoval
376c3b7af1SOmar Sandoval static const char *kyber_domain_names[] = {
386c3b7af1SOmar Sandoval [KYBER_READ] = "READ",
396c3b7af1SOmar Sandoval [KYBER_WRITE] = "WRITE",
406c3b7af1SOmar Sandoval [KYBER_DISCARD] = "DISCARD",
416c3b7af1SOmar Sandoval [KYBER_OTHER] = "OTHER",
426c3b7af1SOmar Sandoval };
436c3b7af1SOmar Sandoval
4400e04393SOmar Sandoval enum {
4500e04393SOmar Sandoval /*
4600e04393SOmar Sandoval * In order to prevent starvation of synchronous requests by a flood of
4700e04393SOmar Sandoval * asynchronous requests, we reserve 25% of requests for synchronous
4800e04393SOmar Sandoval * operations.
4900e04393SOmar Sandoval */
50*8cbe62f4SYu Kuai KYBER_DEFAULT_ASYNC_PERCENT = 75,
5100e04393SOmar Sandoval };
5200e04393SOmar Sandoval /*
536e25cb01SOmar Sandoval * Maximum device-wide depth for each scheduling domain.
5400e04393SOmar Sandoval *
556e25cb01SOmar Sandoval * Even for fast devices with lots of tags like NVMe, you can saturate the
566e25cb01SOmar Sandoval * device with only a fraction of the maximum possible queue depth. So, we cap
576e25cb01SOmar Sandoval * these to a reasonable value.
5800e04393SOmar Sandoval */
5900e04393SOmar Sandoval static const unsigned int kyber_depth[] = {
6000e04393SOmar Sandoval [KYBER_READ] = 256,
616e25cb01SOmar Sandoval [KYBER_WRITE] = 128,
626e25cb01SOmar Sandoval [KYBER_DISCARD] = 64,
636e25cb01SOmar Sandoval [KYBER_OTHER] = 16,
6400e04393SOmar Sandoval };
6500e04393SOmar Sandoval
6600e04393SOmar Sandoval /*
676e25cb01SOmar Sandoval * Default latency targets for each scheduling domain.
686e25cb01SOmar Sandoval */
696e25cb01SOmar Sandoval static const u64 kyber_latency_targets[] = {
70f0a0cdddSOmar Sandoval [KYBER_READ] = 2ULL * NSEC_PER_MSEC,
71f0a0cdddSOmar Sandoval [KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
72f0a0cdddSOmar Sandoval [KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
736e25cb01SOmar Sandoval };
746e25cb01SOmar Sandoval
756e25cb01SOmar Sandoval /*
766e25cb01SOmar Sandoval * Batch size (number of requests we'll dispatch in a row) for each scheduling
776e25cb01SOmar Sandoval * domain.
7800e04393SOmar Sandoval */
7900e04393SOmar Sandoval static const unsigned int kyber_batch_size[] = {
8000e04393SOmar Sandoval [KYBER_READ] = 16,
816e25cb01SOmar Sandoval [KYBER_WRITE] = 8,
826e25cb01SOmar Sandoval [KYBER_DISCARD] = 1,
836e25cb01SOmar Sandoval [KYBER_OTHER] = 1,
846e25cb01SOmar Sandoval };
856e25cb01SOmar Sandoval
866e25cb01SOmar Sandoval /*
876e25cb01SOmar Sandoval * Requests latencies are recorded in a histogram with buckets defined relative
886e25cb01SOmar Sandoval * to the target latency:
896e25cb01SOmar Sandoval *
906e25cb01SOmar Sandoval * <= 1/4 * target latency
916e25cb01SOmar Sandoval * <= 1/2 * target latency
926e25cb01SOmar Sandoval * <= 3/4 * target latency
936e25cb01SOmar Sandoval * <= target latency
946e25cb01SOmar Sandoval * <= 1 1/4 * target latency
956e25cb01SOmar Sandoval * <= 1 1/2 * target latency
966e25cb01SOmar Sandoval * <= 1 3/4 * target latency
976e25cb01SOmar Sandoval * > 1 3/4 * target latency
986e25cb01SOmar Sandoval */
996e25cb01SOmar Sandoval enum {
1006e25cb01SOmar Sandoval /*
1016e25cb01SOmar Sandoval * The width of the latency histogram buckets is
1026e25cb01SOmar Sandoval * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
1036e25cb01SOmar Sandoval */
1046e25cb01SOmar Sandoval KYBER_LATENCY_SHIFT = 2,
1056e25cb01SOmar Sandoval /*
1066e25cb01SOmar Sandoval * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
1076e25cb01SOmar Sandoval * thus, "good".
1086e25cb01SOmar Sandoval */
1096e25cb01SOmar Sandoval KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
1106e25cb01SOmar Sandoval /* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
1116e25cb01SOmar Sandoval KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
1126e25cb01SOmar Sandoval };
1136e25cb01SOmar Sandoval
1146e25cb01SOmar Sandoval /*
1156e25cb01SOmar Sandoval * We measure both the total latency and the I/O latency (i.e., latency after
1166e25cb01SOmar Sandoval * submitting to the device).
1176e25cb01SOmar Sandoval */
1186e25cb01SOmar Sandoval enum {
1196e25cb01SOmar Sandoval KYBER_TOTAL_LATENCY,
1206e25cb01SOmar Sandoval KYBER_IO_LATENCY,
1216e25cb01SOmar Sandoval };
1226e25cb01SOmar Sandoval
1236c3b7af1SOmar Sandoval static const char *kyber_latency_type_names[] = {
1246c3b7af1SOmar Sandoval [KYBER_TOTAL_LATENCY] = "total",
1256c3b7af1SOmar Sandoval [KYBER_IO_LATENCY] = "I/O",
1266c3b7af1SOmar Sandoval };
1276c3b7af1SOmar Sandoval
1286e25cb01SOmar Sandoval /*
1296e25cb01SOmar Sandoval * Per-cpu latency histograms: total latency and I/O latency for each scheduling
1306e25cb01SOmar Sandoval * domain except for KYBER_OTHER.
1316e25cb01SOmar Sandoval */
1326e25cb01SOmar Sandoval struct kyber_cpu_latency {
1336e25cb01SOmar Sandoval atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
13400e04393SOmar Sandoval };
13500e04393SOmar Sandoval
136a6088845SJianchao Wang /*
137a6088845SJianchao Wang * There is a same mapping between ctx & hctx and kcq & khd,
138a6088845SJianchao Wang * we use request->mq_ctx->index_hw to index the kcq in khd.
139a6088845SJianchao Wang */
140a6088845SJianchao Wang struct kyber_ctx_queue {
141a6088845SJianchao Wang /*
142a6088845SJianchao Wang * Used to ensure operations on rq_list and kcq_map to be an atmoic one.
143a6088845SJianchao Wang * Also protect the rqs on rq_list when merge.
144a6088845SJianchao Wang */
145a6088845SJianchao Wang spinlock_t lock;
146a6088845SJianchao Wang struct list_head rq_list[KYBER_NUM_DOMAINS];
147a6088845SJianchao Wang } ____cacheline_aligned_in_smp;
148a6088845SJianchao Wang
14900e04393SOmar Sandoval struct kyber_queue_data {
1506c3b7af1SOmar Sandoval struct request_queue *q;
151c4110804SChristoph Hellwig dev_t dev;
1526c3b7af1SOmar Sandoval
15300e04393SOmar Sandoval /*
1546e25cb01SOmar Sandoval * Each scheduling domain has a limited number of in-flight requests
1556e25cb01SOmar Sandoval * device-wide, limited by these tokens.
15600e04393SOmar Sandoval */
15700e04393SOmar Sandoval struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
15800e04393SOmar Sandoval
1596e25cb01SOmar Sandoval struct kyber_cpu_latency __percpu *cpu_latency;
1606e25cb01SOmar Sandoval
1616e25cb01SOmar Sandoval /* Timer for stats aggregation and adjusting domain tokens. */
1626e25cb01SOmar Sandoval struct timer_list timer;
1636e25cb01SOmar Sandoval
1646e25cb01SOmar Sandoval unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
1656e25cb01SOmar Sandoval
1666e25cb01SOmar Sandoval unsigned long latency_timeout[KYBER_OTHER];
1676e25cb01SOmar Sandoval
1686e25cb01SOmar Sandoval int domain_p99[KYBER_OTHER];
1696e25cb01SOmar Sandoval
17000e04393SOmar Sandoval /* Target latencies in nanoseconds. */
1716e25cb01SOmar Sandoval u64 latency_targets[KYBER_OTHER];
17200e04393SOmar Sandoval };
17300e04393SOmar Sandoval
17400e04393SOmar Sandoval struct kyber_hctx_data {
17500e04393SOmar Sandoval spinlock_t lock;
17600e04393SOmar Sandoval struct list_head rqs[KYBER_NUM_DOMAINS];
17700e04393SOmar Sandoval unsigned int cur_domain;
17800e04393SOmar Sandoval unsigned int batching;
179a6088845SJianchao Wang struct kyber_ctx_queue *kcqs;
180a6088845SJianchao Wang struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
18100203ba4SJens Axboe struct sbq_wait domain_wait[KYBER_NUM_DOMAINS];
182fcf38cdfSOmar Sandoval struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
18300e04393SOmar Sandoval atomic_t wait_index[KYBER_NUM_DOMAINS];
18400e04393SOmar Sandoval };
18500e04393SOmar Sandoval
186fcf38cdfSOmar Sandoval static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
187fcf38cdfSOmar Sandoval void *key);
188fcf38cdfSOmar Sandoval
kyber_sched_domain(blk_opf_t opf)189d625fecdSBart Van Assche static unsigned int kyber_sched_domain(blk_opf_t opf)
19000e04393SOmar Sandoval {
191d625fecdSBart Van Assche switch (opf & REQ_OP_MASK) {
1926e25cb01SOmar Sandoval case REQ_OP_READ:
19300e04393SOmar Sandoval return KYBER_READ;
1946e25cb01SOmar Sandoval case REQ_OP_WRITE:
1956e25cb01SOmar Sandoval return KYBER_WRITE;
1966e25cb01SOmar Sandoval case REQ_OP_DISCARD:
1976e25cb01SOmar Sandoval return KYBER_DISCARD;
1986e25cb01SOmar Sandoval default:
19900e04393SOmar Sandoval return KYBER_OTHER;
20000e04393SOmar Sandoval }
2016e25cb01SOmar Sandoval }
20200e04393SOmar Sandoval
flush_latency_buckets(struct kyber_queue_data * kqd,struct kyber_cpu_latency * cpu_latency,unsigned int sched_domain,unsigned int type)2036e25cb01SOmar Sandoval static void flush_latency_buckets(struct kyber_queue_data *kqd,
2046e25cb01SOmar Sandoval struct kyber_cpu_latency *cpu_latency,
2056e25cb01SOmar Sandoval unsigned int sched_domain, unsigned int type)
20600e04393SOmar Sandoval {
2076e25cb01SOmar Sandoval unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
2086e25cb01SOmar Sandoval atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
2096e25cb01SOmar Sandoval unsigned int bucket;
21000e04393SOmar Sandoval
2116e25cb01SOmar Sandoval for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
2126e25cb01SOmar Sandoval buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
21300e04393SOmar Sandoval }
21400e04393SOmar Sandoval
21500e04393SOmar Sandoval /*
2166e25cb01SOmar Sandoval * Calculate the histogram bucket with the given percentile rank, or -1 if there
2176e25cb01SOmar Sandoval * aren't enough samples yet.
21800e04393SOmar Sandoval */
calculate_percentile(struct kyber_queue_data * kqd,unsigned int sched_domain,unsigned int type,unsigned int percentile)2196e25cb01SOmar Sandoval static int calculate_percentile(struct kyber_queue_data *kqd,
2206e25cb01SOmar Sandoval unsigned int sched_domain, unsigned int type,
2216e25cb01SOmar Sandoval unsigned int percentile)
22200e04393SOmar Sandoval {
2236e25cb01SOmar Sandoval unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
2246e25cb01SOmar Sandoval unsigned int bucket, samples = 0, percentile_samples;
2256e25cb01SOmar Sandoval
2266e25cb01SOmar Sandoval for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
2276e25cb01SOmar Sandoval samples += buckets[bucket];
2286e25cb01SOmar Sandoval
2296e25cb01SOmar Sandoval if (!samples)
2306e25cb01SOmar Sandoval return -1;
23100e04393SOmar Sandoval
23200e04393SOmar Sandoval /*
2336e25cb01SOmar Sandoval * We do the calculation once we have 500 samples or one second passes
2346e25cb01SOmar Sandoval * since the first sample was recorded, whichever comes first.
23500e04393SOmar Sandoval */
2366e25cb01SOmar Sandoval if (!kqd->latency_timeout[sched_domain])
2376e25cb01SOmar Sandoval kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
2386e25cb01SOmar Sandoval if (samples < 500 &&
2396e25cb01SOmar Sandoval time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
2406e25cb01SOmar Sandoval return -1;
24100e04393SOmar Sandoval }
2426e25cb01SOmar Sandoval kqd->latency_timeout[sched_domain] = 0;
2436e25cb01SOmar Sandoval
2446e25cb01SOmar Sandoval percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
2456e25cb01SOmar Sandoval for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
2466e25cb01SOmar Sandoval if (buckets[bucket] >= percentile_samples)
2476e25cb01SOmar Sandoval break;
2486e25cb01SOmar Sandoval percentile_samples -= buckets[bucket];
2496e25cb01SOmar Sandoval }
2506e25cb01SOmar Sandoval memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));
2516e25cb01SOmar Sandoval
252c4110804SChristoph Hellwig trace_kyber_latency(kqd->dev, kyber_domain_names[sched_domain],
2536c3b7af1SOmar Sandoval kyber_latency_type_names[type], percentile,
2546c3b7af1SOmar Sandoval bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);
2556c3b7af1SOmar Sandoval
2566e25cb01SOmar Sandoval return bucket;
25700e04393SOmar Sandoval }
25800e04393SOmar Sandoval
kyber_resize_domain(struct kyber_queue_data * kqd,unsigned int sched_domain,unsigned int depth)2596e25cb01SOmar Sandoval static void kyber_resize_domain(struct kyber_queue_data *kqd,
2606e25cb01SOmar Sandoval unsigned int sched_domain, unsigned int depth)
2616e25cb01SOmar Sandoval {
26200e04393SOmar Sandoval depth = clamp(depth, 1U, kyber_depth[sched_domain]);
2636c3b7af1SOmar Sandoval if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
26400e04393SOmar Sandoval sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
265c4110804SChristoph Hellwig trace_kyber_adjust(kqd->dev, kyber_domain_names[sched_domain],
2666c3b7af1SOmar Sandoval depth);
2676c3b7af1SOmar Sandoval }
26800e04393SOmar Sandoval }
26900e04393SOmar Sandoval
kyber_timer_fn(struct timer_list * t)2706e25cb01SOmar Sandoval static void kyber_timer_fn(struct timer_list *t)
27100e04393SOmar Sandoval {
27241cb0855SIngo Molnar struct kyber_queue_data *kqd = timer_container_of(kqd, t, timer);
2736e25cb01SOmar Sandoval unsigned int sched_domain;
2746e25cb01SOmar Sandoval int cpu;
2756e25cb01SOmar Sandoval bool bad = false;
2766e25cb01SOmar Sandoval
2776e25cb01SOmar Sandoval /* Sum all of the per-cpu latency histograms. */
2786e25cb01SOmar Sandoval for_each_online_cpu(cpu) {
2796e25cb01SOmar Sandoval struct kyber_cpu_latency *cpu_latency;
2806e25cb01SOmar Sandoval
2816e25cb01SOmar Sandoval cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
2826e25cb01SOmar Sandoval for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
2836e25cb01SOmar Sandoval flush_latency_buckets(kqd, cpu_latency, sched_domain,
2846e25cb01SOmar Sandoval KYBER_TOTAL_LATENCY);
2856e25cb01SOmar Sandoval flush_latency_buckets(kqd, cpu_latency, sched_domain,
2866e25cb01SOmar Sandoval KYBER_IO_LATENCY);
2876e25cb01SOmar Sandoval }
2886e25cb01SOmar Sandoval }
2896e25cb01SOmar Sandoval
2906e25cb01SOmar Sandoval /*
2916e25cb01SOmar Sandoval * Check if any domains have a high I/O latency, which might indicate
2926e25cb01SOmar Sandoval * congestion in the device. Note that we use the p90; we don't want to
2936e25cb01SOmar Sandoval * be too sensitive to outliers here.
2946e25cb01SOmar Sandoval */
2956e25cb01SOmar Sandoval for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
2966e25cb01SOmar Sandoval int p90;
2976e25cb01SOmar Sandoval
2986e25cb01SOmar Sandoval p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
2996e25cb01SOmar Sandoval 90);
3006e25cb01SOmar Sandoval if (p90 >= KYBER_GOOD_BUCKETS)
3016e25cb01SOmar Sandoval bad = true;
3026e25cb01SOmar Sandoval }
3036e25cb01SOmar Sandoval
3046e25cb01SOmar Sandoval /*
3056e25cb01SOmar Sandoval * Adjust the scheduling domain depths. If we determined that there was
3066e25cb01SOmar Sandoval * congestion, we throttle all domains with good latencies. Either way,
3076e25cb01SOmar Sandoval * we ease up on throttling domains with bad latencies.
3086e25cb01SOmar Sandoval */
3096e25cb01SOmar Sandoval for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
31000e04393SOmar Sandoval unsigned int orig_depth, depth;
3116e25cb01SOmar Sandoval int p99;
31200e04393SOmar Sandoval
3136e25cb01SOmar Sandoval p99 = calculate_percentile(kqd, sched_domain,
3146e25cb01SOmar Sandoval KYBER_TOTAL_LATENCY, 99);
3156e25cb01SOmar Sandoval /*
3166e25cb01SOmar Sandoval * This is kind of subtle: different domains will not
3176e25cb01SOmar Sandoval * necessarily have enough samples to calculate the latency
3186e25cb01SOmar Sandoval * percentiles during the same window, so we have to remember
3196e25cb01SOmar Sandoval * the p99 for the next time we observe congestion; once we do,
3206e25cb01SOmar Sandoval * we don't want to throttle again until we get more data, so we
3216e25cb01SOmar Sandoval * reset it to -1.
3226e25cb01SOmar Sandoval */
3236e25cb01SOmar Sandoval if (bad) {
3246e25cb01SOmar Sandoval if (p99 < 0)
3256e25cb01SOmar Sandoval p99 = kqd->domain_p99[sched_domain];
3266e25cb01SOmar Sandoval kqd->domain_p99[sched_domain] = -1;
3276e25cb01SOmar Sandoval } else if (p99 >= 0) {
3286e25cb01SOmar Sandoval kqd->domain_p99[sched_domain] = p99;
32900e04393SOmar Sandoval }
3306e25cb01SOmar Sandoval if (p99 < 0)
3316e25cb01SOmar Sandoval continue;
33200e04393SOmar Sandoval
33300e04393SOmar Sandoval /*
3346e25cb01SOmar Sandoval * If this domain has bad latency, throttle less. Otherwise,
3356e25cb01SOmar Sandoval * throttle more iff we determined that there is congestion.
3366e25cb01SOmar Sandoval *
3376e25cb01SOmar Sandoval * The new depth is scaled linearly with the p99 latency vs the
3386e25cb01SOmar Sandoval * latency target. E.g., if the p99 is 3/4 of the target, then
3396e25cb01SOmar Sandoval * we throttle down to 3/4 of the current depth, and if the p99
3406e25cb01SOmar Sandoval * is 2x the target, then we double the depth.
34100e04393SOmar Sandoval */
3426e25cb01SOmar Sandoval if (bad || p99 >= KYBER_GOOD_BUCKETS) {
3436e25cb01SOmar Sandoval orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
3446e25cb01SOmar Sandoval depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
3456e25cb01SOmar Sandoval kyber_resize_domain(kqd, sched_domain, depth);
3466e25cb01SOmar Sandoval }
3476e25cb01SOmar Sandoval }
34800e04393SOmar Sandoval }
34900e04393SOmar Sandoval
kyber_queue_data_alloc(struct request_queue * q)35000e04393SOmar Sandoval static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
35100e04393SOmar Sandoval {
35200e04393SOmar Sandoval struct kyber_queue_data *kqd;
35300e04393SOmar Sandoval int ret = -ENOMEM;
35400e04393SOmar Sandoval int i;
35500e04393SOmar Sandoval
3566e25cb01SOmar Sandoval kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
35700e04393SOmar Sandoval if (!kqd)
35800e04393SOmar Sandoval goto err;
35900e04393SOmar Sandoval
3606c3b7af1SOmar Sandoval kqd->q = q;
361c4110804SChristoph Hellwig kqd->dev = disk_devt(q->disk);
3626c3b7af1SOmar Sandoval
3636e25cb01SOmar Sandoval kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
3646e25cb01SOmar Sandoval GFP_KERNEL | __GFP_ZERO);
3656e25cb01SOmar Sandoval if (!kqd->cpu_latency)
36600e04393SOmar Sandoval goto err_kqd;
36700e04393SOmar Sandoval
3686e25cb01SOmar Sandoval timer_setup(&kqd->timer, kyber_timer_fn, 0);
3696e25cb01SOmar Sandoval
37000e04393SOmar Sandoval for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
37100e04393SOmar Sandoval WARN_ON(!kyber_depth[i]);
37200e04393SOmar Sandoval WARN_ON(!kyber_batch_size[i]);
37300e04393SOmar Sandoval ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
374fa2a1f60SOmar Sandoval kyber_depth[i], -1, false,
375fa2a1f60SOmar Sandoval GFP_KERNEL, q->node);
37600e04393SOmar Sandoval if (ret) {
37700e04393SOmar Sandoval while (--i >= 0)
37800e04393SOmar Sandoval sbitmap_queue_free(&kqd->domain_tokens[i]);
3796e25cb01SOmar Sandoval goto err_buckets;
38000e04393SOmar Sandoval }
38100e04393SOmar Sandoval }
38200e04393SOmar Sandoval
3836e25cb01SOmar Sandoval for (i = 0; i < KYBER_OTHER; i++) {
3846e25cb01SOmar Sandoval kqd->domain_p99[i] = -1;
3856e25cb01SOmar Sandoval kqd->latency_targets[i] = kyber_latency_targets[i];
3866e25cb01SOmar Sandoval }
3876e25cb01SOmar Sandoval
38800e04393SOmar Sandoval return kqd;
38900e04393SOmar Sandoval
3906e25cb01SOmar Sandoval err_buckets:
3916e25cb01SOmar Sandoval free_percpu(kqd->cpu_latency);
39200e04393SOmar Sandoval err_kqd:
39300e04393SOmar Sandoval kfree(kqd);
39400e04393SOmar Sandoval err:
39500e04393SOmar Sandoval return ERR_PTR(ret);
39600e04393SOmar Sandoval }
39700e04393SOmar Sandoval
kyber_depth_updated(struct request_queue * q)3987d337eefSYu Kuai static void kyber_depth_updated(struct request_queue *q)
3997d337eefSYu Kuai {
400*8cbe62f4SYu Kuai blk_mq_set_min_shallow_depth(q, q->async_depth);
4017d337eefSYu Kuai }
4027d337eefSYu Kuai
kyber_init_sched(struct request_queue * q,struct elevator_queue * eq)40349811586SNilay Shroff static int kyber_init_sched(struct request_queue *q, struct elevator_queue *eq)
40400e04393SOmar Sandoval {
4056e25cb01SOmar Sandoval blk_stat_enable_accounting(q);
4066e25cb01SOmar Sandoval
4074d337cebSMing Lei blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
4084d337cebSMing Lei
40900e04393SOmar Sandoval q->elevator = eq;
410*8cbe62f4SYu Kuai q->async_depth = q->nr_requests * KYBER_DEFAULT_ASYNC_PERCENT / 100;
4117d337eefSYu Kuai kyber_depth_updated(q);
41200e04393SOmar Sandoval
41300e04393SOmar Sandoval return 0;
41400e04393SOmar Sandoval }
41500e04393SOmar Sandoval
kyber_alloc_sched_data(struct request_queue * q)416d4c3ef56SNilay Shroff static void *kyber_alloc_sched_data(struct request_queue *q)
417d4c3ef56SNilay Shroff {
418d4c3ef56SNilay Shroff struct kyber_queue_data *kqd;
419d4c3ef56SNilay Shroff
420d4c3ef56SNilay Shroff kqd = kyber_queue_data_alloc(q);
421d4c3ef56SNilay Shroff if (IS_ERR(kqd))
422d4c3ef56SNilay Shroff return NULL;
423d4c3ef56SNilay Shroff
424d4c3ef56SNilay Shroff return kqd;
425d4c3ef56SNilay Shroff }
426d4c3ef56SNilay Shroff
kyber_exit_sched(struct elevator_queue * e)42700e04393SOmar Sandoval static void kyber_exit_sched(struct elevator_queue *e)
42800e04393SOmar Sandoval {
42900e04393SOmar Sandoval struct kyber_queue_data *kqd = e->elevator_data;
43000e04393SOmar Sandoval
431292a089dSSteven Rostedt (Google) timer_shutdown_sync(&kqd->timer);
43268497092SJens Axboe blk_stat_disable_accounting(kqd->q);
433d4c3ef56SNilay Shroff }
434d4c3ef56SNilay Shroff
kyber_free_sched_data(void * elv_data)435d4c3ef56SNilay Shroff static void kyber_free_sched_data(void *elv_data)
436d4c3ef56SNilay Shroff {
437d4c3ef56SNilay Shroff struct kyber_queue_data *kqd = elv_data;
438d4c3ef56SNilay Shroff int i;
439d4c3ef56SNilay Shroff
440d4c3ef56SNilay Shroff if (!kqd)
441d4c3ef56SNilay Shroff return;
44200e04393SOmar Sandoval
44300e04393SOmar Sandoval for (i = 0; i < KYBER_NUM_DOMAINS; i++)
44400e04393SOmar Sandoval sbitmap_queue_free(&kqd->domain_tokens[i]);
4456e25cb01SOmar Sandoval free_percpu(kqd->cpu_latency);
44600e04393SOmar Sandoval kfree(kqd);
44700e04393SOmar Sandoval }
44800e04393SOmar Sandoval
kyber_ctx_queue_init(struct kyber_ctx_queue * kcq)449a6088845SJianchao Wang static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
450a6088845SJianchao Wang {
451a6088845SJianchao Wang unsigned int i;
452a6088845SJianchao Wang
453a6088845SJianchao Wang spin_lock_init(&kcq->lock);
454a6088845SJianchao Wang for (i = 0; i < KYBER_NUM_DOMAINS; i++)
455a6088845SJianchao Wang INIT_LIST_HEAD(&kcq->rq_list[i]);
456a6088845SJianchao Wang }
457a6088845SJianchao Wang
kyber_init_hctx(struct blk_mq_hw_ctx * hctx,unsigned int hctx_idx)458ffa772cfSYang Yang static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
459ffa772cfSYang Yang {
46000e04393SOmar Sandoval struct kyber_hctx_data *khd;
46100e04393SOmar Sandoval int i;
46200e04393SOmar Sandoval
46300e04393SOmar Sandoval khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
46400e04393SOmar Sandoval if (!khd)
46500e04393SOmar Sandoval return -ENOMEM;
46600e04393SOmar Sandoval
467a6088845SJianchao Wang khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
468a6088845SJianchao Wang sizeof(struct kyber_ctx_queue),
469a6088845SJianchao Wang GFP_KERNEL, hctx->numa_node);
470a6088845SJianchao Wang if (!khd->kcqs)
471a6088845SJianchao Wang goto err_khd;
472a6088845SJianchao Wang
473a6088845SJianchao Wang for (i = 0; i < hctx->nr_ctx; i++)
474a6088845SJianchao Wang kyber_ctx_queue_init(&khd->kcqs[i]);
475a6088845SJianchao Wang
476a6088845SJianchao Wang for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
477a6088845SJianchao Wang if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
478efe1f3a1SMing Lei ilog2(8), GFP_KERNEL, hctx->numa_node,
479c548e62bSMing Lei false, false)) {
480a6088845SJianchao Wang while (--i >= 0)
481a6088845SJianchao Wang sbitmap_free(&khd->kcq_map[i]);
482a6088845SJianchao Wang goto err_kcqs;
483a6088845SJianchao Wang }
484a6088845SJianchao Wang }
485a6088845SJianchao Wang
48600e04393SOmar Sandoval spin_lock_init(&khd->lock);
48700e04393SOmar Sandoval
48800e04393SOmar Sandoval for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
48900e04393SOmar Sandoval INIT_LIST_HEAD(&khd->rqs[i]);
49000203ba4SJens Axboe khd->domain_wait[i].sbq = NULL;
49100203ba4SJens Axboe init_waitqueue_func_entry(&khd->domain_wait[i].wait,
492fcf38cdfSOmar Sandoval kyber_domain_wake);
49300203ba4SJens Axboe khd->domain_wait[i].wait.private = hctx;
49400203ba4SJens Axboe INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry);
49500e04393SOmar Sandoval atomic_set(&khd->wait_index[i], 0);
49600e04393SOmar Sandoval }
49700e04393SOmar Sandoval
49800e04393SOmar Sandoval khd->cur_domain = 0;
49900e04393SOmar Sandoval khd->batching = 0;
50000e04393SOmar Sandoval
50100e04393SOmar Sandoval hctx->sched_data = khd;
50200e04393SOmar Sandoval
50300e04393SOmar Sandoval return 0;
504a6088845SJianchao Wang
505a6088845SJianchao Wang err_kcqs:
506a6088845SJianchao Wang kfree(khd->kcqs);
507a6088845SJianchao Wang err_khd:
508a6088845SJianchao Wang kfree(khd);
509a6088845SJianchao Wang return -ENOMEM;
51000e04393SOmar Sandoval }
51100e04393SOmar Sandoval
kyber_exit_hctx(struct blk_mq_hw_ctx * hctx,unsigned int hctx_idx)51200e04393SOmar Sandoval static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
51300e04393SOmar Sandoval {
514a6088845SJianchao Wang struct kyber_hctx_data *khd = hctx->sched_data;
515a6088845SJianchao Wang int i;
516a6088845SJianchao Wang
517a6088845SJianchao Wang for (i = 0; i < KYBER_NUM_DOMAINS; i++)
518a6088845SJianchao Wang sbitmap_free(&khd->kcq_map[i]);
519a6088845SJianchao Wang kfree(khd->kcqs);
52000e04393SOmar Sandoval kfree(hctx->sched_data);
52100e04393SOmar Sandoval }
52200e04393SOmar Sandoval
rq_get_domain_token(struct request * rq)52300e04393SOmar Sandoval static int rq_get_domain_token(struct request *rq)
52400e04393SOmar Sandoval {
52500e04393SOmar Sandoval return (long)rq->elv.priv[0];
52600e04393SOmar Sandoval }
52700e04393SOmar Sandoval
rq_set_domain_token(struct request * rq,int token)52800e04393SOmar Sandoval static void rq_set_domain_token(struct request *rq, int token)
52900e04393SOmar Sandoval {
53000e04393SOmar Sandoval rq->elv.priv[0] = (void *)(long)token;
53100e04393SOmar Sandoval }
53200e04393SOmar Sandoval
rq_clear_domain_token(struct kyber_queue_data * kqd,struct request * rq)53300e04393SOmar Sandoval static void rq_clear_domain_token(struct kyber_queue_data *kqd,
53400e04393SOmar Sandoval struct request *rq)
53500e04393SOmar Sandoval {
53600e04393SOmar Sandoval unsigned int sched_domain;
53700e04393SOmar Sandoval int nr;
53800e04393SOmar Sandoval
53900e04393SOmar Sandoval nr = rq_get_domain_token(rq);
54000e04393SOmar Sandoval if (nr != -1) {
541a6088845SJianchao Wang sched_domain = kyber_sched_domain(rq->cmd_flags);
54200e04393SOmar Sandoval sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
54300e04393SOmar Sandoval rq->mq_ctx->cpu);
54400e04393SOmar Sandoval }
54500e04393SOmar Sandoval }
54600e04393SOmar Sandoval
kyber_limit_depth(blk_opf_t opf,struct blk_mq_alloc_data * data)547d625fecdSBart Van Assche static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
54800e04393SOmar Sandoval {
549*8cbe62f4SYu Kuai if (!blk_mq_is_sync_read(opf))
550*8cbe62f4SYu Kuai data->shallow_depth = data->q->async_depth;
5515bbf4e5aSChristoph Hellwig }
5525bbf4e5aSChristoph Hellwig
kyber_bio_merge(struct request_queue * q,struct bio * bio,unsigned int nr_segs)553efed9a33SOmar Sandoval static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
55414ccb66bSChristoph Hellwig unsigned int nr_segs)
555a6088845SJianchao Wang {
556efed9a33SOmar Sandoval struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
55761667cb6SGuixin Liu struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(bio->bi_opf, ctx);
558a6088845SJianchao Wang struct kyber_hctx_data *khd = hctx->sched_data;
559f31967f0SJens Axboe struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
560a6088845SJianchao Wang unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
561a6088845SJianchao Wang struct list_head *rq_list = &kcq->rq_list[sched_domain];
562a6088845SJianchao Wang bool merged;
563a6088845SJianchao Wang
564a6088845SJianchao Wang spin_lock(&kcq->lock);
565bdc6a287SBaolin Wang merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
566a6088845SJianchao Wang spin_unlock(&kcq->lock);
567a6088845SJianchao Wang
568a6088845SJianchao Wang return merged;
569a6088845SJianchao Wang }
570a6088845SJianchao Wang
kyber_prepare_request(struct request * rq)5715d9c305bSChristoph Hellwig static void kyber_prepare_request(struct request *rq)
5725bbf4e5aSChristoph Hellwig {
57300e04393SOmar Sandoval rq_set_domain_token(rq, -1);
57400e04393SOmar Sandoval }
57500e04393SOmar Sandoval
kyber_insert_requests(struct blk_mq_hw_ctx * hctx,struct list_head * rq_list,blk_insert_t flags)576a6088845SJianchao Wang static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
57793fffe16SChristoph Hellwig struct list_head *rq_list,
57893fffe16SChristoph Hellwig blk_insert_t flags)
579a6088845SJianchao Wang {
580a6088845SJianchao Wang struct kyber_hctx_data *khd = hctx->sched_data;
581a6088845SJianchao Wang struct request *rq, *next;
582a6088845SJianchao Wang
583a6088845SJianchao Wang list_for_each_entry_safe(rq, next, rq_list, queuelist) {
584a6088845SJianchao Wang unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
585f31967f0SJens Axboe struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]];
586a6088845SJianchao Wang struct list_head *head = &kcq->rq_list[sched_domain];
587a6088845SJianchao Wang
588a6088845SJianchao Wang spin_lock(&kcq->lock);
589fb7b9b02SVincent Fu trace_block_rq_insert(rq);
59093fffe16SChristoph Hellwig if (flags & BLK_MQ_INSERT_AT_HEAD)
591a6088845SJianchao Wang list_move(&rq->queuelist, head);
592a6088845SJianchao Wang else
593a6088845SJianchao Wang list_move_tail(&rq->queuelist, head);
594a6088845SJianchao Wang sbitmap_set_bit(&khd->kcq_map[sched_domain],
595f31967f0SJens Axboe rq->mq_ctx->index_hw[hctx->type]);
596a6088845SJianchao Wang spin_unlock(&kcq->lock);
597a6088845SJianchao Wang }
598a6088845SJianchao Wang }
599a6088845SJianchao Wang
kyber_finish_request(struct request * rq)6007b9e9361SChristoph Hellwig static void kyber_finish_request(struct request *rq)
60100e04393SOmar Sandoval {
6027b9e9361SChristoph Hellwig struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
60300e04393SOmar Sandoval
60400e04393SOmar Sandoval rq_clear_domain_token(kqd, rq);
60500e04393SOmar Sandoval }
60600e04393SOmar Sandoval
add_latency_sample(struct kyber_cpu_latency * cpu_latency,unsigned int sched_domain,unsigned int type,u64 target,u64 latency)6076e25cb01SOmar Sandoval static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
6086e25cb01SOmar Sandoval unsigned int sched_domain, unsigned int type,
6096e25cb01SOmar Sandoval u64 target, u64 latency)
61000e04393SOmar Sandoval {
6116e25cb01SOmar Sandoval unsigned int bucket;
6126e25cb01SOmar Sandoval u64 divisor;
61300e04393SOmar Sandoval
6146e25cb01SOmar Sandoval if (latency > 0) {
6156e25cb01SOmar Sandoval divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
6166e25cb01SOmar Sandoval bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
6176e25cb01SOmar Sandoval KYBER_LATENCY_BUCKETS - 1);
6186e25cb01SOmar Sandoval } else {
6196e25cb01SOmar Sandoval bucket = 0;
62000e04393SOmar Sandoval }
62100e04393SOmar Sandoval
6226e25cb01SOmar Sandoval atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
6236e25cb01SOmar Sandoval }
6246e25cb01SOmar Sandoval
kyber_completed_request(struct request * rq,u64 now)6256e25cb01SOmar Sandoval static void kyber_completed_request(struct request *rq, u64 now)
6266e25cb01SOmar Sandoval {
6276e25cb01SOmar Sandoval struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
6286e25cb01SOmar Sandoval struct kyber_cpu_latency *cpu_latency;
6296e25cb01SOmar Sandoval unsigned int sched_domain;
6306e25cb01SOmar Sandoval u64 target;
6316e25cb01SOmar Sandoval
6326e25cb01SOmar Sandoval sched_domain = kyber_sched_domain(rq->cmd_flags);
6336e25cb01SOmar Sandoval if (sched_domain == KYBER_OTHER)
63400e04393SOmar Sandoval return;
63500e04393SOmar Sandoval
6366e25cb01SOmar Sandoval cpu_latency = get_cpu_ptr(kqd->cpu_latency);
6376e25cb01SOmar Sandoval target = kqd->latency_targets[sched_domain];
6386e25cb01SOmar Sandoval add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
6396e25cb01SOmar Sandoval target, now - rq->start_time_ns);
6406e25cb01SOmar Sandoval add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
6416e25cb01SOmar Sandoval now - rq->io_start_time_ns);
6426e25cb01SOmar Sandoval put_cpu_ptr(kqd->cpu_latency);
64300e04393SOmar Sandoval
6446e25cb01SOmar Sandoval timer_reduce(&kqd->timer, jiffies + HZ / 10);
64500e04393SOmar Sandoval }
64600e04393SOmar Sandoval
647a6088845SJianchao Wang struct flush_kcq_data {
648a6088845SJianchao Wang struct kyber_hctx_data *khd;
64900e04393SOmar Sandoval unsigned int sched_domain;
650a6088845SJianchao Wang struct list_head *list;
651a6088845SJianchao Wang };
65200e04393SOmar Sandoval
flush_busy_kcq(struct sbitmap * sb,unsigned int bitnr,void * data)653a6088845SJianchao Wang static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
654a6088845SJianchao Wang {
655a6088845SJianchao Wang struct flush_kcq_data *flush_data = data;
656a6088845SJianchao Wang struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];
657a6088845SJianchao Wang
658a6088845SJianchao Wang spin_lock(&kcq->lock);
659a6088845SJianchao Wang list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
660a6088845SJianchao Wang flush_data->list);
661a6088845SJianchao Wang sbitmap_clear_bit(sb, bitnr);
662a6088845SJianchao Wang spin_unlock(&kcq->lock);
663a6088845SJianchao Wang
664a6088845SJianchao Wang return true;
66500e04393SOmar Sandoval }
666a6088845SJianchao Wang
kyber_flush_busy_kcqs(struct kyber_hctx_data * khd,unsigned int sched_domain,struct list_head * list)667a6088845SJianchao Wang static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
668a6088845SJianchao Wang unsigned int sched_domain,
669a6088845SJianchao Wang struct list_head *list)
670a6088845SJianchao Wang {
671a6088845SJianchao Wang struct flush_kcq_data data = {
672a6088845SJianchao Wang .khd = khd,
673a6088845SJianchao Wang .sched_domain = sched_domain,
674a6088845SJianchao Wang .list = list,
675a6088845SJianchao Wang };
676a6088845SJianchao Wang
677a6088845SJianchao Wang sbitmap_for_each_set(&khd->kcq_map[sched_domain],
678a6088845SJianchao Wang flush_busy_kcq, &data);
67900e04393SOmar Sandoval }
68000e04393SOmar Sandoval
kyber_domain_wake(wait_queue_entry_t * wqe,unsigned mode,int flags,void * key)68100203ba4SJens Axboe static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags,
68200e04393SOmar Sandoval void *key)
68300e04393SOmar Sandoval {
68400203ba4SJens Axboe struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private);
68500203ba4SJens Axboe struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait);
68600e04393SOmar Sandoval
68700203ba4SJens Axboe sbitmap_del_wait_queue(wait);
68800e04393SOmar Sandoval blk_mq_run_hw_queue(hctx, true);
68900e04393SOmar Sandoval return 1;
69000e04393SOmar Sandoval }
69100e04393SOmar Sandoval
kyber_get_domain_token(struct kyber_queue_data * kqd,struct kyber_hctx_data * khd,struct blk_mq_hw_ctx * hctx)69200e04393SOmar Sandoval static int kyber_get_domain_token(struct kyber_queue_data *kqd,
69300e04393SOmar Sandoval struct kyber_hctx_data *khd,
69400e04393SOmar Sandoval struct blk_mq_hw_ctx *hctx)
69500e04393SOmar Sandoval {
69600e04393SOmar Sandoval unsigned int sched_domain = khd->cur_domain;
69700e04393SOmar Sandoval struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
69800203ba4SJens Axboe struct sbq_wait *wait = &khd->domain_wait[sched_domain];
69900e04393SOmar Sandoval struct sbq_wait_state *ws;
70000e04393SOmar Sandoval int nr;
70100e04393SOmar Sandoval
70200e04393SOmar Sandoval nr = __sbitmap_queue_get(domain_tokens);
70300e04393SOmar Sandoval
70400e04393SOmar Sandoval /*
70500e04393SOmar Sandoval * If we failed to get a domain token, make sure the hardware queue is
70600e04393SOmar Sandoval * run when one becomes available. Note that this is serialized on
70700e04393SOmar Sandoval * khd->lock, but we still need to be careful about the waker.
70800e04393SOmar Sandoval */
70900203ba4SJens Axboe if (nr < 0 && list_empty_careful(&wait->wait.entry)) {
71000e04393SOmar Sandoval ws = sbq_wait_ptr(domain_tokens,
71100e04393SOmar Sandoval &khd->wait_index[sched_domain]);
712fcf38cdfSOmar Sandoval khd->domain_ws[sched_domain] = ws;
71300203ba4SJens Axboe sbitmap_add_wait_queue(domain_tokens, ws, wait);
71400e04393SOmar Sandoval
71500e04393SOmar Sandoval /*
71600e04393SOmar Sandoval * Try again in case a token was freed before we got on the wait
717fcf38cdfSOmar Sandoval * queue.
71800e04393SOmar Sandoval */
71900e04393SOmar Sandoval nr = __sbitmap_queue_get(domain_tokens);
720fcf38cdfSOmar Sandoval }
7218cf46660SOmar Sandoval
722fcf38cdfSOmar Sandoval /*
723fcf38cdfSOmar Sandoval * If we got a token while we were on the wait queue, remove ourselves
724fcf38cdfSOmar Sandoval * from the wait queue to ensure that all wake ups make forward
725fcf38cdfSOmar Sandoval * progress. It's possible that the waker already deleted the entry
726fcf38cdfSOmar Sandoval * between the !list_empty_careful() check and us grabbing the lock, but
727fcf38cdfSOmar Sandoval * list_del_init() is okay with that.
728fcf38cdfSOmar Sandoval */
72900203ba4SJens Axboe if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) {
730fcf38cdfSOmar Sandoval ws = khd->domain_ws[sched_domain];
731fcf38cdfSOmar Sandoval spin_lock_irq(&ws->wait.lock);
73200203ba4SJens Axboe sbitmap_del_wait_queue(wait);
733fcf38cdfSOmar Sandoval spin_unlock_irq(&ws->wait.lock);
7348cf46660SOmar Sandoval }
735fcf38cdfSOmar Sandoval
73600e04393SOmar Sandoval return nr;
73700e04393SOmar Sandoval }
73800e04393SOmar Sandoval
73900e04393SOmar Sandoval static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data * kqd,struct kyber_hctx_data * khd,struct blk_mq_hw_ctx * hctx)74000e04393SOmar Sandoval kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
74100e04393SOmar Sandoval struct kyber_hctx_data *khd,
742a6088845SJianchao Wang struct blk_mq_hw_ctx *hctx)
74300e04393SOmar Sandoval {
74400e04393SOmar Sandoval struct list_head *rqs;
74500e04393SOmar Sandoval struct request *rq;
74600e04393SOmar Sandoval int nr;
74700e04393SOmar Sandoval
74800e04393SOmar Sandoval rqs = &khd->rqs[khd->cur_domain];
74900e04393SOmar Sandoval
75000e04393SOmar Sandoval /*
751a6088845SJianchao Wang * If we already have a flushed request, then we just need to get a
752a6088845SJianchao Wang * token for it. Otherwise, if there are pending requests in the kcqs,
753a6088845SJianchao Wang * flush the kcqs, but only if we can get a token. If not, we should
754a6088845SJianchao Wang * leave the requests in the kcqs so that they can be merged. Note that
755a6088845SJianchao Wang * khd->lock serializes the flushes, so if we observed any bit set in
756a6088845SJianchao Wang * the kcq_map, we will always get a request.
75700e04393SOmar Sandoval */
75800e04393SOmar Sandoval rq = list_first_entry_or_null(rqs, struct request, queuelist);
75900e04393SOmar Sandoval if (rq) {
76000e04393SOmar Sandoval nr = kyber_get_domain_token(kqd, khd, hctx);
76100e04393SOmar Sandoval if (nr >= 0) {
76200e04393SOmar Sandoval khd->batching++;
76300e04393SOmar Sandoval rq_set_domain_token(rq, nr);
76400e04393SOmar Sandoval list_del_init(&rq->queuelist);
76500e04393SOmar Sandoval return rq;
7666c3b7af1SOmar Sandoval } else {
767c4110804SChristoph Hellwig trace_kyber_throttled(kqd->dev,
7686c3b7af1SOmar Sandoval kyber_domain_names[khd->cur_domain]);
76900e04393SOmar Sandoval }
770a6088845SJianchao Wang } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
771a6088845SJianchao Wang nr = kyber_get_domain_token(kqd, khd, hctx);
772a6088845SJianchao Wang if (nr >= 0) {
773a6088845SJianchao Wang kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
774a6088845SJianchao Wang rq = list_first_entry(rqs, struct request, queuelist);
775a6088845SJianchao Wang khd->batching++;
776a6088845SJianchao Wang rq_set_domain_token(rq, nr);
777a6088845SJianchao Wang list_del_init(&rq->queuelist);
778a6088845SJianchao Wang return rq;
7796c3b7af1SOmar Sandoval } else {
780c4110804SChristoph Hellwig trace_kyber_throttled(kqd->dev,
7816c3b7af1SOmar Sandoval kyber_domain_names[khd->cur_domain]);
782a6088845SJianchao Wang }
78300e04393SOmar Sandoval }
78400e04393SOmar Sandoval
78500e04393SOmar Sandoval /* There were either no pending requests or no tokens. */
78600e04393SOmar Sandoval return NULL;
78700e04393SOmar Sandoval }
78800e04393SOmar Sandoval
kyber_dispatch_request(struct blk_mq_hw_ctx * hctx)78900e04393SOmar Sandoval static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
79000e04393SOmar Sandoval {
79100e04393SOmar Sandoval struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
79200e04393SOmar Sandoval struct kyber_hctx_data *khd = hctx->sched_data;
79300e04393SOmar Sandoval struct request *rq;
79400e04393SOmar Sandoval int i;
79500e04393SOmar Sandoval
79600e04393SOmar Sandoval spin_lock(&khd->lock);
79700e04393SOmar Sandoval
79800e04393SOmar Sandoval /*
79900e04393SOmar Sandoval * First, if we are still entitled to batch, try to dispatch a request
80000e04393SOmar Sandoval * from the batch.
80100e04393SOmar Sandoval */
80200e04393SOmar Sandoval if (khd->batching < kyber_batch_size[khd->cur_domain]) {
803a6088845SJianchao Wang rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
80400e04393SOmar Sandoval if (rq)
80500e04393SOmar Sandoval goto out;
80600e04393SOmar Sandoval }
80700e04393SOmar Sandoval
80800e04393SOmar Sandoval /*
80900e04393SOmar Sandoval * Either,
81000e04393SOmar Sandoval * 1. We were no longer entitled to a batch.
81100e04393SOmar Sandoval * 2. The domain we were batching didn't have any requests.
81200e04393SOmar Sandoval * 3. The domain we were batching was out of tokens.
81300e04393SOmar Sandoval *
81400e04393SOmar Sandoval * Start another batch. Note that this wraps back around to the original
81500e04393SOmar Sandoval * domain if no other domains have requests or tokens.
81600e04393SOmar Sandoval */
81700e04393SOmar Sandoval khd->batching = 0;
81800e04393SOmar Sandoval for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
81900e04393SOmar Sandoval if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
82000e04393SOmar Sandoval khd->cur_domain = 0;
82100e04393SOmar Sandoval else
82200e04393SOmar Sandoval khd->cur_domain++;
82300e04393SOmar Sandoval
824a6088845SJianchao Wang rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
82500e04393SOmar Sandoval if (rq)
82600e04393SOmar Sandoval goto out;
82700e04393SOmar Sandoval }
82800e04393SOmar Sandoval
82900e04393SOmar Sandoval rq = NULL;
83000e04393SOmar Sandoval out:
83100e04393SOmar Sandoval spin_unlock(&khd->lock);
83200e04393SOmar Sandoval return rq;
83300e04393SOmar Sandoval }
83400e04393SOmar Sandoval
kyber_has_work(struct blk_mq_hw_ctx * hctx)83500e04393SOmar Sandoval static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
83600e04393SOmar Sandoval {
83700e04393SOmar Sandoval struct kyber_hctx_data *khd = hctx->sched_data;
83800e04393SOmar Sandoval int i;
83900e04393SOmar Sandoval
84000e04393SOmar Sandoval for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
841a6088845SJianchao Wang if (!list_empty_careful(&khd->rqs[i]) ||
842a6088845SJianchao Wang sbitmap_any_bit_set(&khd->kcq_map[i]))
84300e04393SOmar Sandoval return true;
84400e04393SOmar Sandoval }
845a6088845SJianchao Wang
846a6088845SJianchao Wang return false;
84700e04393SOmar Sandoval }
84800e04393SOmar Sandoval
8496e25cb01SOmar Sandoval #define KYBER_LAT_SHOW_STORE(domain, name) \
8506e25cb01SOmar Sandoval static ssize_t kyber_##name##_lat_show(struct elevator_queue *e, \
85100e04393SOmar Sandoval char *page) \
85200e04393SOmar Sandoval { \
85300e04393SOmar Sandoval struct kyber_queue_data *kqd = e->elevator_data; \
85400e04393SOmar Sandoval \
8556e25cb01SOmar Sandoval return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \
85600e04393SOmar Sandoval } \
85700e04393SOmar Sandoval \
8586e25cb01SOmar Sandoval static ssize_t kyber_##name##_lat_store(struct elevator_queue *e, \
85900e04393SOmar Sandoval const char *page, size_t count) \
86000e04393SOmar Sandoval { \
86100e04393SOmar Sandoval struct kyber_queue_data *kqd = e->elevator_data; \
86200e04393SOmar Sandoval unsigned long long nsec; \
86300e04393SOmar Sandoval int ret; \
86400e04393SOmar Sandoval \
86500e04393SOmar Sandoval ret = kstrtoull(page, 10, &nsec); \
86600e04393SOmar Sandoval if (ret) \
86700e04393SOmar Sandoval return ret; \
86800e04393SOmar Sandoval \
8696e25cb01SOmar Sandoval kqd->latency_targets[domain] = nsec; \
87000e04393SOmar Sandoval \
87100e04393SOmar Sandoval return count; \
87200e04393SOmar Sandoval }
8736e25cb01SOmar Sandoval KYBER_LAT_SHOW_STORE(KYBER_READ, read);
8746e25cb01SOmar Sandoval KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
87500e04393SOmar Sandoval #undef KYBER_LAT_SHOW_STORE
87600e04393SOmar Sandoval
87700e04393SOmar Sandoval #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
87800aab2f2SThomas Weißschuh static const struct elv_fs_entry kyber_sched_attrs[] = {
87900e04393SOmar Sandoval KYBER_LAT_ATTR(read),
88000e04393SOmar Sandoval KYBER_LAT_ATTR(write),
88100e04393SOmar Sandoval __ATTR_NULL
88200e04393SOmar Sandoval };
88300e04393SOmar Sandoval #undef KYBER_LAT_ATTR
88400e04393SOmar Sandoval
88516b738f6SOmar Sandoval #ifdef CONFIG_BLK_DEBUG_FS
88616b738f6SOmar Sandoval #define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name) \
88716b738f6SOmar Sandoval static int kyber_##name##_tokens_show(void *data, struct seq_file *m) \
88816b738f6SOmar Sandoval { \
88916b738f6SOmar Sandoval struct request_queue *q = data; \
89016b738f6SOmar Sandoval struct kyber_queue_data *kqd = q->elevator->elevator_data; \
89116b738f6SOmar Sandoval \
89216b738f6SOmar Sandoval sbitmap_queue_show(&kqd->domain_tokens[domain], m); \
89316b738f6SOmar Sandoval return 0; \
89416b738f6SOmar Sandoval } \
89516b738f6SOmar Sandoval \
89616b738f6SOmar Sandoval static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos) \
89716b738f6SOmar Sandoval __acquires(&khd->lock) \
89816b738f6SOmar Sandoval { \
89916b738f6SOmar Sandoval struct blk_mq_hw_ctx *hctx = m->private; \
90016b738f6SOmar Sandoval struct kyber_hctx_data *khd = hctx->sched_data; \
90116b738f6SOmar Sandoval \
90216b738f6SOmar Sandoval spin_lock(&khd->lock); \
90316b738f6SOmar Sandoval return seq_list_start(&khd->rqs[domain], *pos); \
90416b738f6SOmar Sandoval } \
90516b738f6SOmar Sandoval \
90616b738f6SOmar Sandoval static void *kyber_##name##_rqs_next(struct seq_file *m, void *v, \
90716b738f6SOmar Sandoval loff_t *pos) \
90816b738f6SOmar Sandoval { \
90916b738f6SOmar Sandoval struct blk_mq_hw_ctx *hctx = m->private; \
91016b738f6SOmar Sandoval struct kyber_hctx_data *khd = hctx->sched_data; \
91116b738f6SOmar Sandoval \
91216b738f6SOmar Sandoval return seq_list_next(v, &khd->rqs[domain], pos); \
91316b738f6SOmar Sandoval } \
91416b738f6SOmar Sandoval \
91516b738f6SOmar Sandoval static void kyber_##name##_rqs_stop(struct seq_file *m, void *v) \
91616b738f6SOmar Sandoval __releases(&khd->lock) \
91716b738f6SOmar Sandoval { \
91816b738f6SOmar Sandoval struct blk_mq_hw_ctx *hctx = m->private; \
91916b738f6SOmar Sandoval struct kyber_hctx_data *khd = hctx->sched_data; \
92016b738f6SOmar Sandoval \
92116b738f6SOmar Sandoval spin_unlock(&khd->lock); \
92216b738f6SOmar Sandoval } \
92316b738f6SOmar Sandoval \
92416b738f6SOmar Sandoval static const struct seq_operations kyber_##name##_rqs_seq_ops = { \
92516b738f6SOmar Sandoval .start = kyber_##name##_rqs_start, \
92616b738f6SOmar Sandoval .next = kyber_##name##_rqs_next, \
92716b738f6SOmar Sandoval .stop = kyber_##name##_rqs_stop, \
92816b738f6SOmar Sandoval .show = blk_mq_debugfs_rq_show, \
92916b738f6SOmar Sandoval }; \
93016b738f6SOmar Sandoval \
93116b738f6SOmar Sandoval static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
93216b738f6SOmar Sandoval { \
93316b738f6SOmar Sandoval struct blk_mq_hw_ctx *hctx = data; \
93416b738f6SOmar Sandoval struct kyber_hctx_data *khd = hctx->sched_data; \
93500203ba4SJens Axboe wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \
93616b738f6SOmar Sandoval \
9372055da97SIngo Molnar seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
93816b738f6SOmar Sandoval return 0; \
93916b738f6SOmar Sandoval }
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ,read)94016b738f6SOmar Sandoval KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
9416e25cb01SOmar Sandoval KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
9426e25cb01SOmar Sandoval KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
94316b738f6SOmar Sandoval KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
94416b738f6SOmar Sandoval #undef KYBER_DEBUGFS_DOMAIN_ATTRS
94516b738f6SOmar Sandoval
94616b738f6SOmar Sandoval static int kyber_cur_domain_show(void *data, struct seq_file *m)
94716b738f6SOmar Sandoval {
94816b738f6SOmar Sandoval struct blk_mq_hw_ctx *hctx = data;
94916b738f6SOmar Sandoval struct kyber_hctx_data *khd = hctx->sched_data;
95016b738f6SOmar Sandoval
9516c3b7af1SOmar Sandoval seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]);
95216b738f6SOmar Sandoval return 0;
95316b738f6SOmar Sandoval }
95416b738f6SOmar Sandoval
kyber_batching_show(void * data,struct seq_file * m)95516b738f6SOmar Sandoval static int kyber_batching_show(void *data, struct seq_file *m)
95616b738f6SOmar Sandoval {
95716b738f6SOmar Sandoval struct blk_mq_hw_ctx *hctx = data;
95816b738f6SOmar Sandoval struct kyber_hctx_data *khd = hctx->sched_data;
95916b738f6SOmar Sandoval
96016b738f6SOmar Sandoval seq_printf(m, "%u\n", khd->batching);
96116b738f6SOmar Sandoval return 0;
96216b738f6SOmar Sandoval }
96316b738f6SOmar Sandoval
96416b738f6SOmar Sandoval #define KYBER_QUEUE_DOMAIN_ATTRS(name) \
96516b738f6SOmar Sandoval {#name "_tokens", 0400, kyber_##name##_tokens_show}
96616b738f6SOmar Sandoval static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
96716b738f6SOmar Sandoval KYBER_QUEUE_DOMAIN_ATTRS(read),
9686e25cb01SOmar Sandoval KYBER_QUEUE_DOMAIN_ATTRS(write),
9696e25cb01SOmar Sandoval KYBER_QUEUE_DOMAIN_ATTRS(discard),
97016b738f6SOmar Sandoval KYBER_QUEUE_DOMAIN_ATTRS(other),
97116b738f6SOmar Sandoval {},
97216b738f6SOmar Sandoval };
97316b738f6SOmar Sandoval #undef KYBER_QUEUE_DOMAIN_ATTRS
97416b738f6SOmar Sandoval
97516b738f6SOmar Sandoval #define KYBER_HCTX_DOMAIN_ATTRS(name) \
97616b738f6SOmar Sandoval {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops}, \
97716b738f6SOmar Sandoval {#name "_waiting", 0400, kyber_##name##_waiting_show}
97816b738f6SOmar Sandoval static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
97916b738f6SOmar Sandoval KYBER_HCTX_DOMAIN_ATTRS(read),
9806e25cb01SOmar Sandoval KYBER_HCTX_DOMAIN_ATTRS(write),
9816e25cb01SOmar Sandoval KYBER_HCTX_DOMAIN_ATTRS(discard),
98216b738f6SOmar Sandoval KYBER_HCTX_DOMAIN_ATTRS(other),
98316b738f6SOmar Sandoval {"cur_domain", 0400, kyber_cur_domain_show},
98416b738f6SOmar Sandoval {"batching", 0400, kyber_batching_show},
98516b738f6SOmar Sandoval {},
98616b738f6SOmar Sandoval };
98716b738f6SOmar Sandoval #undef KYBER_HCTX_DOMAIN_ATTRS
98816b738f6SOmar Sandoval #endif
98916b738f6SOmar Sandoval
99000e04393SOmar Sandoval static struct elevator_type kyber_sched = {
991f9cd4bfeSJens Axboe .ops = {
99200e04393SOmar Sandoval .init_sched = kyber_init_sched,
99300e04393SOmar Sandoval .exit_sched = kyber_exit_sched,
99400e04393SOmar Sandoval .init_hctx = kyber_init_hctx,
99500e04393SOmar Sandoval .exit_hctx = kyber_exit_hctx,
996d4c3ef56SNilay Shroff .alloc_sched_data = kyber_alloc_sched_data,
997d4c3ef56SNilay Shroff .free_sched_data = kyber_free_sched_data,
9985bbf4e5aSChristoph Hellwig .limit_depth = kyber_limit_depth,
999a6088845SJianchao Wang .bio_merge = kyber_bio_merge,
10005bbf4e5aSChristoph Hellwig .prepare_request = kyber_prepare_request,
1001a6088845SJianchao Wang .insert_requests = kyber_insert_requests,
10027b9e9361SChristoph Hellwig .finish_request = kyber_finish_request,
1003ba989a01SMing Lei .requeue_request = kyber_finish_request,
100400e04393SOmar Sandoval .completed_request = kyber_completed_request,
100500e04393SOmar Sandoval .dispatch_request = kyber_dispatch_request,
100600e04393SOmar Sandoval .has_work = kyber_has_work,
1007ffa772cfSYang Yang .depth_updated = kyber_depth_updated,
100800e04393SOmar Sandoval },
100916b738f6SOmar Sandoval #ifdef CONFIG_BLK_DEBUG_FS
101016b738f6SOmar Sandoval .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
101116b738f6SOmar Sandoval .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
101216b738f6SOmar Sandoval #endif
101300e04393SOmar Sandoval .elevator_attrs = kyber_sched_attrs,
101400e04393SOmar Sandoval .elevator_name = "kyber",
101500e04393SOmar Sandoval .elevator_owner = THIS_MODULE,
101600e04393SOmar Sandoval };
101700e04393SOmar Sandoval
kyber_init(void)101800e04393SOmar Sandoval static int __init kyber_init(void)
101900e04393SOmar Sandoval {
102000e04393SOmar Sandoval return elv_register(&kyber_sched);
102100e04393SOmar Sandoval }
102200e04393SOmar Sandoval
kyber_exit(void)102300e04393SOmar Sandoval static void __exit kyber_exit(void)
102400e04393SOmar Sandoval {
102500e04393SOmar Sandoval elv_unregister(&kyber_sched);
102600e04393SOmar Sandoval }
102700e04393SOmar Sandoval
102800e04393SOmar Sandoval module_init(kyber_init);
102900e04393SOmar Sandoval module_exit(kyber_exit);
103000e04393SOmar Sandoval
103100e04393SOmar Sandoval MODULE_AUTHOR("Omar Sandoval");
103200e04393SOmar Sandoval MODULE_LICENSE("GPL");
103300e04393SOmar Sandoval MODULE_DESCRIPTION("Kyber I/O scheduler");
1034