block/bfq-cgroup.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936

/*
 * BFQ: CGROUPS support.
 *
 * Based on ideas and code from CFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
 *
 * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
 * file.
 */

#ifdef CONFIG_CGROUP_BFQIO

/*
 * Serializes changes to bfqio_cgroup->online (made by the css
 * online/offline callbacks) against the cgroup file read/write handlers,
 * which check that flag before touching the group parameters.
 */
static DEFINE_MUTEX(bfqio_mutex);

/*
 * Tell whether @bgrp is no longer online.  A NULL @bgrp is reported as
 * not removed.
 */
static bool bfqio_is_removed(struct bfqio_cgroup *bgrp)
{
	if (!bgrp)
		return false;

	return !bgrp->online;
}

/*
 * Statically allocated per-cgroup state for the root cgroup, preset with
 * the default group parameters.
 */
static struct bfqio_cgroup bfqio_root_cgroup = {
	.ioprio_class = BFQ_DEFAULT_GRP_CLASS,
	.ioprio = BFQ_DEFAULT_GRP_IOPRIO,
	.weight = BFQ_DEFAULT_GRP_WEIGHT,
};

/*
 * Initialize @entity as a child of @bfqg: attach it to the group's
 * scheduler data and turn its pending (new_*) parameters into the
 * effective ones.
 */
static inline void bfq_init_entity(struct bfq_entity *entity,
				   struct bfq_group *bfqg)
{
	/* Position in the hierarchy. */
	entity->sched_data = &bfqg->sched_data;
	entity->parent = bfqg->my_entity;

	/* Scheduling parameters. */
	entity->ioprio_class = entity->new_ioprio_class;
	entity->ioprio = entity->new_ioprio;
	entity->orig_weight = entity->new_weight;
	entity->weight = entity->new_weight;
}

/*
 * Map a cgroup_subsys_state to the bfqio_cgroup embedding it; NULL maps
 * to NULL.
 */
static struct bfqio_cgroup *css_to_bfqio(struct cgroup_subsys_state *css)
{
	if (!css)
		return NULL;

	return container_of(css, struct bfqio_cgroup, css);
}

/*
 * Look up, among the groups hanging off @bgrp (a hash table in spirit,
 * for now just a list), the one whose bfqd key is @bfqd.  Returns NULL
 * if there is none.  Must be called under rcu_read_lock().
 */
static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp,
					    struct bfq_data *bfqd)
{
	struct bfq_group *candidate;

	hlist_for_each_entry_rcu(candidate, &bgrp->group_data, group_node) {
		if (rcu_dereference(candidate->bfqd) == bfqd)
			return candidate;
	}

	return NULL;
}

/*
 * Set up the entity embedded in @bfqg from the parameters stored in
 * @bgrp, and reset the group's count of active entities.
 */
static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp,
					 struct bfq_group *bfqg)
{
	struct bfq_entity *entity = &bfqg->entity;

	if (bgrp->weight != 0) {
		/*
		 * A weight has been set for the group: it takes priority
		 * over the ioprio value, which is derived from it.
		 */
		if (bgrp->weight < BFQ_MIN_WEIGHT ||
		    bgrp->weight > BFQ_MAX_WEIGHT) {
			printk(KERN_CRIT "bfq_group_init_entity: "
					 "bgrp->weight %d\n", bgrp->weight);
			BUG();
		}
		entity->new_weight = bgrp->weight;
		entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight);
	} else {
		/*
		 * bgrp->weight == 0 means the weight has never been set
		 * via the sysfs interface: derive it from the current
		 * ioprio value.
		 */
		entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio);
		entity->new_ioprio = bgrp->ioprio;
	}

	entity->weight = entity->new_weight;
	entity->orig_weight = entity->weight;
	entity->ioprio = entity->new_ioprio;
	entity->new_ioprio_class = bgrp->ioprio_class;
	entity->ioprio_class = entity->new_ioprio_class;
	entity->my_sched_data = &bfqg->sched_data;
	bfqg->active_entities = 0;
}

/*
 * Make @parent the parent of @bfqg in the scheduling hierarchy.  Both
 * pointers must be non-NULL.
 */
static inline void bfq_group_set_parent(struct bfq_group *bfqg,
					struct bfq_group *parent)
{
	BUG_ON(parent == NULL);
	BUG_ON(bfqg == NULL);

	bfqg->entity.parent = parent->my_entity;
	bfqg->entity.sched_data = &parent->sched_data;
}

/**
 * bfq_group_chain_alloc - allocate a chain of groups.
 * @bfqd: queue descriptor.
 * @css: the leaf cgroup_subsys_state this chain starts from.
 *
 * Allocate a chain of groups starting from the one belonging to
 * @css up to the root cgroup.  Stop if a cgroup on the chain
 * to the root has already an allocated group on @bfqd.
 *
 * The new groups are not made visible to lookups here: they are only
 * chained to each other, child to parent, through their bfqd field, and
 * are expected to be published by bfq_group_chain_link().
 *
 * Return: the newly allocated group for @css, or NULL if an allocation
 * failed (in which case everything allocated so far is freed) or if
 * @css already had a group on @bfqd, so that nothing was allocated.
 */
static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd,
					       struct cgroup_subsys_state *css)
{
	struct bfqio_cgroup *bgrp;
	struct bfq_group *bfqg, *prev = NULL, *leaf = NULL;

	for (; css != NULL; css = css->parent) {
		bgrp = css_to_bfqio(css);

		bfqg = bfqio_lookup_group(bgrp, bfqd);
		if (bfqg != NULL) {
			/*
			 * All the cgroups in the path from there to the
			 * root must have a bfq_group for bfqd, so we don't
			 * need any more allocations.
			 */
			break;
		}

		/*
		 * Zeroed allocation: in particular the bfqd field starts
		 * out NULL, which terminates the temporary list below.
		 */
		bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC);
		if (bfqg == NULL)
			goto cleanup;

		bfq_group_init_entity(bgrp, bfqg);
		bfqg->my_entity = &bfqg->entity;

		if (leaf == NULL) {
			leaf = bfqg;
			prev = leaf;
		} else {
			bfq_group_set_parent(prev, bfqg);
			/*
			 * Build a list of allocated nodes using the bfqd
			 * field, that is still unused and will be
			 * initialized only after the node will be
			 * connected.
			 */
			prev->bfqd = bfqg;
			prev = bfqg;
		}
	}

	return leaf;

cleanup:
	/* Free every node of the temporary list threaded through ->bfqd. */
	while (leaf != NULL) {
		prev = leaf;
		leaf = leaf->bfqd;
		kfree(prev);
	}

	return NULL;
}

/**
 * bfq_group_chain_link - link an allocated group chain to a cgroup
 *                        hierarchy.
 * @bfqd: the queue descriptor.
 * @css: the leaf cgroup_subsys_state to start from.
 * @leaf: the leaf group (to be associated to @css).
 *
 * Try to link a chain of groups to a cgroup hierarchy, connecting the
 * nodes bottom-up, so we can be sure that when we find a cgroup in the
 * hierarchy that already has a group associated to @bfqd all the nodes
 * in the path to the root cgroup have one too.
 *
 * On entry the groups are chained through their bfqd field, as left by
 * bfq_group_chain_alloc(); on return that field holds @bfqd in each of
 * them.
 *
 * On locking: the queue lock protects the hierarchy (there is a hierarchy
 * per device) while the bfqio_cgroup lock protects the list of groups
 * belonging to the same cgroup.
 */
static void bfq_group_chain_link(struct bfq_data *bfqd,
				 struct cgroup_subsys_state *css,
				 struct bfq_group *leaf)
{
	struct bfqio_cgroup *bgrp;
	struct bfq_group *bfqg, *next, *prev = NULL;
	unsigned long flags;

	assert_spin_locked(bfqd->queue->queue_lock);

	for (; css != NULL && leaf != NULL; css = css->parent) {
		bgrp = css_to_bfqio(css);
		/* Save the link: ->bfqd is overwritten with @bfqd below. */
		next = leaf->bfqd;

		/* The cgroup must not already have a group for @bfqd. */
		bfqg = bfqio_lookup_group(bgrp, bfqd);
		BUG_ON(bfqg != NULL);

		spin_lock_irqsave(&bgrp->lock, flags);

		/*
		 * Set the lookup key first, then add the group to the
		 * per-cgroup list and to the per-device list.
		 */
		rcu_assign_pointer(leaf->bfqd, bfqd);
		hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data);
		hlist_add_head(&leaf->bfqd_node, &bfqd->group_list);

		spin_unlock_irqrestore(&bgrp->lock, flags);

		prev = leaf;
		leaf = next;
	}

	/* Running out of cgroups with groups still to link is a bug. */
	BUG_ON(css == NULL && leaf != NULL);
	if (css != NULL && prev != NULL) {
		/*
		 * @css now refers to the first ancestor that was not
		 * linked by the loop: attach the topmost new group to
		 * that ancestor's group.
		 */
		bgrp = css_to_bfqio(css);
		bfqg = bfqio_lookup_group(bgrp, bfqd);
		bfq_group_set_parent(prev, bfqg);
	}
}

/**
 * bfq_find_alloc_group - return the group associated to @bfqd in @css.
 * @bfqd: queue descriptor.
 * @css: cgroup_subsys_state of the cgroup being searched for.
 *
 * Return the group associated to @bfqd in the cgroup of @css, allocating
 * it (together with any missing ancestor group) if necessary.  When a
 * group is returned all the cgroups in the path to the root have a group
 * associated to @bfqd.
 *
 * If the allocation fails, the root group is returned: this breaks
 * guarantees but is a safe fallback.  If this loss becomes a problem it
 * can be mitigated using the equivalent weight (given by the product of
 * the weights of the groups in the path from the group to the root) in
 * the root scheduler.
 *
 * All the missing nodes in the path from the leaf cgroup to the root are
 * allocated first, and they are connected only after all the allocations
 * have been successful.
 */
static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
					      struct cgroup_subsys_state *css)
{
	struct bfq_group *group;

	/* Fast path: the group already exists. */
	group = bfqio_lookup_group(css_to_bfqio(css), bfqd);
	if (group != NULL)
		return group;

	/* Slow path: allocate the missing chain, fall back to the root. */
	group = bfq_group_chain_alloc(bfqd, css);
	if (group == NULL)
		return bfqd->root_group;

	bfq_group_chain_link(bfqd, css, group);

	return group;
}

/**
 * bfq_bfqq_move - migrate @bfqq to @bfqg.
 * @bfqd: queue descriptor.
 * @bfqq: the queue to move.
 * @entity: @bfqq's entity.
 * @bfqg: the group to move to.
 *
 * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
 * it on the new one.  Avoid putting the entity on the old group idle tree.
 *
 * Must be called under the queue lock; the cgroup owning @bfqg must
 * not disappear (by now this just means that we are called under
 * rcu_read_lock()).
 */
static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
			  struct bfq_entity *entity, struct bfq_group *bfqg)
{
	int busy, resume;

	/*
	 * Snapshot the state of the queue before detaching it: @busy is
	 * its busy flag, @resume tells whether its sort_list is non-empty.
	 */
	busy = bfq_bfqq_busy(bfqq);
	resume = !RB_EMPTY_ROOT(&bfqq->sort_list);

	BUG_ON(resume && !entity->on_st);
	BUG_ON(busy && !resume && entity->on_st &&
	       bfqq != bfqd->in_service_queue);

	/*
	 * Detach the queue from its current group.  A busy queue with an
	 * empty sort_list is just removed from the busy queues; a busy
	 * queue with a non-empty sort_list is deactivated here and
	 * activated again below, once it points to the new group.  A
	 * non-busy entity that is still on a service tree is put.
	 */
	if (busy) {
		BUG_ON(atomic_read(&bfqq->ref) < 2);

		if (!resume)
			bfq_del_bfqq_busy(bfqd, bfqq, 0);
		else
			bfq_deactivate_bfqq(bfqd, bfqq, 0);
	} else if (entity->on_st)
		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);

	/*
	 * Here we use a reference to bfqg.  We don't need a refcounter
	 * as the cgroup reference will not be dropped, so that its
	 * destroy() callback will not be invoked.
	 */
	entity->parent = bfqg->my_entity;
	entity->sched_data = &bfqg->sched_data;

	if (busy && resume)
		bfq_activate_bfqq(bfqd, bfqq);

	/* Nothing in service and nothing in the driver: kick a dispatch. */
	if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver)
		bfq_schedule_dispatch(bfqd);
}

/**
 * __bfq_bic_change_cgroup - move @bic to the cgroup of @css.
 * @bfqd: the queue descriptor.
 * @bic: the bic to move.
 * @css: the cgroup_subsys_state of the cgroup to move to.
 *
 * Move @bic to the group that bfq_find_alloc_group() returns for @css on
 * @bfqd, assuming that bfqd->queue is locked; the caller has to make
 * sure that the reference to the cgroup is valid across the call.
 *
 * The async queue is not migrated: if it is scheduled in a different
 * group, @bic drops both its pointer and its reference to it.  The sync
 * queue, if scheduled in a different group, is moved with
 * bfq_bfqq_move().
 *
 * Return: the group obtained from bfq_find_alloc_group(), i.e. the group
 * of @css on @bfqd, or the root group if that group could not be
 * allocated.
 *
 * NOTE: an alternative approach might have been to store the current
 * cgroup in bfqq and getting a reference to it, reducing the lookup
 * time here, at the price of slightly more complex code.
 */
static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
						struct bfq_io_cq *bic,
						struct cgroup_subsys_state *css)
{
	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
	struct bfq_entity *entity;
	struct bfq_group *bfqg;

	bfqg = bfq_find_alloc_group(bfqd, css);
	if (async_bfqq != NULL) {
		entity = &async_bfqq->entity;

		if (entity->sched_data != &bfqg->sched_data) {
			/* Detach from the old async queue and drop our ref. */
			bic_set_bfqq(bic, NULL, 0);
			bfq_log_bfqq(bfqd, async_bfqq,
				     "bic_change_group: %p %d",
				     async_bfqq, atomic_read(&async_bfqq->ref));
			bfq_put_queue(async_bfqq);
		}
	}

	if (sync_bfqq != NULL) {
		entity = &sync_bfqq->entity;
		if (entity->sched_data != &bfqg->sched_data)
			bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
	}

	return bfqg;
}

/**
 * bfq_bic_change_cgroup - move @bic to the cgroup of @css.
 * @bic: the bic being migrated.
 * @css: the cgroup_subsys_state of the destination cgroup.
 *
 * When the task owning @bic is moved to a new cgroup, @bic is
 * immediately moved into its new parent group.  Nothing is done if the
 * bfq_data of the bic's queue cannot be obtained (and locked).
 */
static void bfq_bic_change_cgroup(struct bfq_io_cq *bic,
				  struct cgroup_subsys_state *css)
{
	unsigned long uninitialized_var(flags);
	struct bfq_data *bfqd;

	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
				   &flags);
	if (bfqd == NULL)
		return;

	__bfq_bic_change_cgroup(bfqd, bic, css);
	bfq_put_bfqd_unlock(bfqd, &flags);
}

/**
 * bfq_bic_update_cgroup - update the cgroup of @bic.
 * @bic: the @bic to update.
 *
 * Make sure that @bic is enqueued in the cgroup of the current task.
 * We need this in addition to moving bics during the cgroup attach
 * phase because the task owning @bic could be at its first disk
 * access or we may end up in the root cgroup as the result of a
 * memory allocation failure and here we try to move to the right
 * group.
 *
 * Must be called under the queue lock.  It is safe to use the returned
 * value even after the rcu_read_unlock() as the migration/destruction
 * paths act under the queue lock too.  IOW it is impossible to race with
 * group migration/destruction and end up with an invalid group as:
 *   a) here cgroup has not yet been destroyed, nor its destroy callback
 *      has started execution, as current holds a reference to it,
 *   b) if it is destroyed after rcu_read_unlock() [after current is
 *      migrated to a different cgroup] its attach() callback will have
 *      taken care of removing all the references to the old cgroup data.
 */
static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	struct bfq_group *group;

	BUG_ON(bfqd == NULL);

	/*
	 * The css of the current task is looked up and used entirely
	 * within a single RCU read-side critical section.
	 */
	rcu_read_lock();
	group = __bfq_bic_change_cgroup(bfqd, bic,
					task_css(current, bfqio_cgrp_id));
	rcu_read_unlock();

	return group;
}

/**
 * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
 * @st: the service tree being flushed.
 */
static inline void bfq_flush_idle_tree(struct bfq_service_tree *st)
{
	struct bfq_entity *idle;

	/* Keep deactivating the first idle entity until none is left. */
	while ((idle = st->first_idle) != NULL)
		__bfq_deactivate_entity(idle, 0);
}

/**
 * bfq_reparent_leaf_entity - move leaf entity to the root_group.
 * @bfqd: the device data structure with the root group.
 * @entity: the entity to move.
 */
static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
					    struct bfq_entity *entity)
{
	struct bfq_queue *queue = bfq_entity_to_bfqq(entity);

	/* Only leaf entities, i.e. those backed by a queue, are expected. */
	BUG_ON(queue == NULL);

	bfq_bfqq_move(bfqd, queue, entity, bfqd->root_group);
}

/**
 * bfq_reparent_active_entities - move to the root group all active
 *                                entities.
 * @bfqd: the device data structure with the root group.
 * @bfqg: the group to move from.
 * @st: the service tree with the entities.
 *
 * Needs queue_lock to be taken and reference to be valid over the call.
 */
static inline void bfq_reparent_active_entities(struct bfq_data *bfqd,
						struct bfq_group *bfqg,
						struct bfq_service_tree *st)
{
	struct rb_root *active = &st->active;
	struct bfq_entity *entity = NULL;

	if (!RB_EMPTY_ROOT(active))
		entity = bfq_entity_of(rb_first(active));

	/* Move the first active entity away until no entity is found. */
	while (entity != NULL) {
		bfq_reparent_leaf_entity(bfqd, entity);
		entity = bfq_entity_of(rb_first(active));
	}

	/* The in-service entity, if any, is moved as well. */
	if (bfqg->sched_data.in_service_entity != NULL)
		bfq_reparent_leaf_entity(bfqd,
			bfqg->sched_data.in_service_entity);
}

/**
 * bfq_destroy_group - destroy @bfqg.
 * @bgrp: the bfqio_cgroup containing @bfqg (not used by the body).
 * @bfqg: the group being destroyed.
 *
 * Destroy @bfqg, making sure that it is not referenced from its parent.
 *
 * The group is unlinked from its cgroup list, its service trees are
 * emptied (idle entities are deactivated, active and in-service ones are
 * moved to the root group), its own entity is deactivated and finally
 * the group is freed.
 */
static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg)
{
	struct bfq_data *bfqd;
	struct bfq_service_tree *st;
	struct bfq_entity *entity = bfqg->my_entity;
	unsigned long uninitialized_var(flags);
	int i;

	/* Unlink from the per-cgroup list of groups. */
	hlist_del(&bfqg->group_node);

	/*
	 * Empty all service_trees belonging to this group before
	 * deactivating the group itself.
	 */
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
		st = bfqg->sched_data.service_tree + i;

		/*
		 * The idle tree may still contain bfq_queues belonging
		 * to exited task because they never migrated to a different
		 * cgroup from the one being destroyed now.  No one else
		 * can access them so it's safe to act without any lock.
		 */
		bfq_flush_idle_tree(st);

		/*
		 * It may happen that some queues are still active
		 * (busy) upon group destruction (if the corresponding
		 * processes have been forced to terminate). We move
		 * all the leaf entities corresponding to these queues
		 * to the root_group.
		 * Also, it may happen that the group has an entity
		 * in service, which is disconnected from the active
		 * tree: it must be moved, too.
		 * There is no need to put the sync queues, as the
		 * scheduler has taken no reference.
		 */
		bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
		if (bfqd != NULL) {
			bfq_reparent_active_entities(bfqd, bfqg, st);
			bfq_put_bfqd_unlock(bfqd, &flags);
		}
		/* Both trees must be empty at this point. */
		BUG_ON(!RB_EMPTY_ROOT(&st->active));
		BUG_ON(!RB_EMPTY_ROOT(&st->idle));
	}
	BUG_ON(bfqg->sched_data.next_in_service != NULL);
	BUG_ON(bfqg->sched_data.in_service_entity != NULL);

	/*
	 * We may race with device destruction, take extra care when
	 * dereferencing bfqg->bfqd.
	 */
	bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
	if (bfqd != NULL) {
		/*
		 * Unlink from the per-device list, deactivate the group's
		 * own entity and release the group's async queues.
		 */
		hlist_del(&bfqg->bfqd_node);
		__bfq_deactivate_entity(entity, 0);
		bfq_put_async_queues(bfqd, bfqg);
		bfq_put_bfqd_unlock(bfqd, &flags);
	}
	BUG_ON(entity->tree != NULL);

	/*
	 * No need to defer the kfree() to the end of the RCU grace
	 * period: we are called from the destroy() callback of our
	 * cgroup, so we can be sure that no one is a) still using
	 * this cgroup or b) doing lookups in it.
	 */
	kfree(bfqg);
}

static void bfq_end_wr_async(struct bfq_data *bfqd)
{
	struct hlist_node *tmp;
	struct bfq_group *bfqg;

	hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node)
		bfq_end_wr_async_queues(bfqd, bfqg);
	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}

/**
 * bfq_disconnect_groups - disconnect @bfqd from all its groups.
 * @bfqd: the device descriptor being exited.
 *
 * When the device exits we just make sure that no lookup can return
 * the now unused group structures.  They will be deallocated on cgroup
 * destruction.
 */
static void bfq_disconnect_groups(struct bfq_data *bfqd)
{
	struct bfq_group *group;
	struct hlist_node *next;

	bfq_log(bfqd, "disconnect_groups beginning");
	hlist_for_each_entry_safe(group, next, &bfqd->group_list, bfqd_node) {
		/* Drop the group from the device list and deactivate it. */
		hlist_del(&group->bfqd_node);
		__bfq_deactivate_entity(group->my_entity, 0);

		/*
		 * The group stays in its cgroup's hash: only its key is
		 * invalidated.  Since bfqd is being destroyed no lookup
		 * can race with this assignment, and no new element can
		 * be added to the list either.
		 */
		rcu_assign_pointer(group->bfqd, NULL);

		bfq_log(bfqd, "disconnect_groups: put async for group %p",
			group);
		bfq_put_async_queues(bfqd, group);
	}
}

static inline void bfq_free_root_group(struct bfq_data *bfqd)
{
	struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;
	struct bfq_group *bfqg = bfqd->root_group;

	bfq_put_async_queues(bfqd, bfqg);

	spin_lock_irq(&bgrp->lock);
	hlist_del_rcu(&bfqg->group_node);
	spin_unlock_irq(&bgrp->lock);

	/*
	 * No need to synchronize_rcu() here: since the device is gone
	 * there cannot be any read-side access to its root_group.
	 */
	kfree(bfqg);
}

static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
{
	struct bfq_group *bfqg;
	struct bfqio_cgroup *bgrp;
	int i;

	bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node);
	if (bfqg == NULL)
		return NULL;

	bfqg->entity.parent = NULL;
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;

	bgrp = &bfqio_root_cgroup;
	spin_lock_irq(&bgrp->lock);
	rcu_assign_pointer(bfqg->bfqd, bfqd);
	hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data);
	spin_unlock_irq(&bgrp->lock);

	return bfqg;
}

/*
 * SHOW_FUNCTION(__VAR) - generate bfqio_cgroup_<__VAR>_read(), the
 * read_u64 handler returning the current value of bgrp-><__VAR>.
 *
 * The value is read under bgrp->lock, with bfqio_mutex held so that the
 * online state of the group cannot change meanwhile.  If the group is
 * no longer online the handler returns -ENODEV converted to u64, as the
 * return type leaves no separate way to report an error.
 */
#define SHOW_FUNCTION(__VAR)						\
static u64 bfqio_cgroup_##__VAR##_read(struct cgroup_subsys_state *css, \
				       struct cftype *cftype)		\
{									\
	struct bfqio_cgroup *bgrp = css_to_bfqio(css);			\
	u64 ret = -ENODEV;						\
									\
	mutex_lock(&bfqio_mutex);					\
	if (bfqio_is_removed(bgrp))					\
		goto out_unlock;					\
									\
	spin_lock_irq(&bgrp->lock);					\
	ret = bgrp->__VAR;						\
	spin_unlock_irq(&bgrp->lock);					\
									\
out_unlock:								\
	mutex_unlock(&bfqio_mutex);					\
	return ret;							\
}

/* One read handler per tunable exposed in bfqio_files[]. */
SHOW_FUNCTION(weight);
SHOW_FUNCTION(ioprio);
SHOW_FUNCTION(ioprio_class);
#undef SHOW_FUNCTION

/*
 * STORE_FUNCTION(__VAR, __MIN, __MAX) - generate
 * bfqio_cgroup_<__VAR>_write(), the write_u64 handler storing a new
 * value in bgrp-><__VAR>.
 *
 * The handler returns -EINVAL if the value is outside [__MIN, __MAX],
 * -ENODEV if the group is no longer online, and 0 on success.  On
 * success the value is stored in the cgroup and propagated, as
 * new_<__VAR>, to the entity of every bfq_group of the cgroup whose
 * pending value differs; those entities get their ioprio_changed flag
 * set.
 *
 * NOTE(review): val is a u64, so the lower-bound check cannot trigger
 * for the instantiation that passes 0 as __MIN.
 */
#define STORE_FUNCTION(__VAR, __MIN, __MAX)				\
static int bfqio_cgroup_##__VAR##_write(struct cgroup_subsys_state *css,\
					struct cftype *cftype,		\
					u64 val)			\
{									\
	struct bfqio_cgroup *bgrp = css_to_bfqio(css);			\
	struct bfq_group *bfqg;						\
	int ret = -EINVAL;						\
									\
	if (val < (__MIN) || val > (__MAX))				\
		return ret;						\
									\
	ret = -ENODEV;							\
	mutex_lock(&bfqio_mutex);					\
	if (bfqio_is_removed(bgrp))					\
		goto out_unlock;					\
	ret = 0;							\
									\
	spin_lock_irq(&bgrp->lock);					\
	bgrp->__VAR = (unsigned short)val;				\
	hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) {	\
		/*							\
		 * Setting the ioprio_changed flag of the entity        \
		 * to 1 with new_##__VAR == ##__VAR would re-set        \
		 * the value of the weight to its ioprio mapping.       \
		 * Set the flag only if necessary.			\
		 */							\
		if ((unsigned short)val != bfqg->entity.new_##__VAR) {  \
			bfqg->entity.new_##__VAR = (unsigned short)val; \
			/*						\
			 * Make sure that the above new value has been	\
			 * stored in bfqg->entity.new_##__VAR before	\
			 * setting the ioprio_changed flag. In fact,	\
			 * this flag may be read asynchronously (in	\
			 * critical sections protected by a different	\
			 * lock than that held here), and finding this	\
			 * flag set may cause the execution of the code	\
			 * for updating parameters whose value may	\
			 * depend also on bfqg->entity.new_##__VAR (in	\
			 * __bfq_entity_update_weight_prio).		\
			 * This barrier makes sure that the new value	\
			 * of bfqg->entity.new_##__VAR is correctly	\
			 * seen in that code.				\
			 */						\
			smp_wmb();                                      \
			bfqg->entity.ioprio_changed = 1;                \
		}							\
	}								\
	spin_unlock_irq(&bgrp->lock);					\
									\
out_unlock:								\
	mutex_unlock(&bfqio_mutex);					\
	return ret;							\
}

/* One write handler per tunable, each with its own valid range. */
STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT);
STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1);
STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE);
#undef STORE_FUNCTION

/*
 * Control files of the subsystem: one u64 file per tunable, each wired
 * to the read/write handlers generated above for that tunable.
 */
static struct cftype bfqio_files[] = {
	{
		.name = "weight",
		.write_u64 = bfqio_cgroup_weight_write,
		.read_u64 = bfqio_cgroup_weight_read,
	},
	{
		.name = "ioprio",
		.write_u64 = bfqio_cgroup_ioprio_write,
		.read_u64 = bfqio_cgroup_ioprio_read,
	},
	{
		.name = "ioprio_class",
		.write_u64 = bfqio_cgroup_ioprio_class_write,
		.read_u64 = bfqio_cgroup_ioprio_class_read,
	},
	{ },	/* terminate */
};

/*
 * css_alloc callback: provide the bfqio_cgroup for a new cgroup.  The
 * root cgroup (no @parent_css) uses the static bfqio_root_cgroup, any
 * other cgroup gets a zeroed allocation.  Returns the embedded css, or
 * ERR_PTR(-ENOMEM) if the allocation fails.
 */
static struct cgroup_subsys_state *bfqio_create(struct cgroup_subsys_state
						*parent_css)
{
	struct bfqio_cgroup *bgrp = &bfqio_root_cgroup;

	if (parent_css != NULL) {
		bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL);
		if (!bgrp)
			return ERR_PTR(-ENOMEM);
	}

	spin_lock_init(&bgrp->lock);
	INIT_HLIST_HEAD(&bgrp->group_data);
	bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS;
	bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO;

	return &bgrp->css;
}

/*
 * We cannot support shared io contexts, as we have no means to support
 * two tasks with the same ioc in two different groups without major rework
 * of the main bic/bfqq data structures.  By now we allow a task to change
 * its cgroup only if it's the only owner of its ioc; the drawback of this
 * behavior is that a group containing a task that forked using CLONE_IO
 * will not be destroyed until the tasks sharing the ioc die.
 */
static int bfqio_can_attach(struct cgroup_subsys_state *css,
			    struct cgroup_taskset *tset)
{
	struct io_context *ioc;
	struct task_struct *task;
	int ret = 0;

	cgroup_taskset_for_each(task, tset) {
		/*
		 * Hold task_lock() while looking at the io_context, to
		 * avoid races with exit_io_context().
		 *
		 * A task without an ioc is either too young or exiting:
		 * in the first case the ioc cannot be shared, in the
		 * second the attach will fail anyway, no matter what is
		 * returned here.  Only an ioc with more than one task is
		 * refused.
		 */
		task_lock(task);
		ioc = task->io_context;
		if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1)
			ret = -EINVAL;
		task_unlock(task);

		if (ret != 0)
			break;
	}

	return ret;
}

/*
 * attach callback: for every task in @tset that has an io_context, move
 * each of its icqs that belongs to a queue using the "bfq" elevator to
 * the cgroup of @css.
 */
static void bfqio_attach(struct cgroup_subsys_state *css,
			 struct cgroup_taskset *tset)
{
	struct io_context *ioc;
	struct task_struct *task;
	struct io_cq *icq;

	/*
	 * IMPORTANT NOTE: moving more than one process at a time to a
	 * new group has not been tested yet.
	 */
	cgroup_taskset_for_each(task, tset) {
		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
		if (!ioc)
			continue;

		/* Handle the cgroup change for each bfq icq of the task. */
		rcu_read_lock();
		hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node) {
			if (!strncmp(icq->q->elevator->type->elevator_name,
				     "bfq", ELV_NAME_MAX))
				bfq_bic_change_cgroup(icq_to_bic(icq), css);
		}
		rcu_read_unlock();

		put_io_context(ioc);
	}
}

static void bfqio_destroy(struct cgroup_subsys_state *css)
{
	struct bfqio_cgroup *bgrp = css_to_bfqio(css);
	struct hlist_node *tmp;
	struct bfq_group *bfqg;

	/*
	 * Since we are destroying the cgroup, there are no more tasks
	 * referencing it, and all the RCU grace periods that may have
	 * referenced it are ended (as the destruction of the parent
	 * cgroup is RCU-safe); bgrp->group_data will not be accessed by
	 * anything else and we don't need any synchronization.
	 */
	hlist_for_each_entry_safe(bfqg, tmp, &bgrp->group_data, group_node)
		bfq_destroy_group(bgrp, bfqg);

	BUG_ON(!hlist_empty(&bgrp->group_data));

	kfree(bgrp);
}

/* css_online callback: mark the cgroup online.  Always returns 0. */
static int bfqio_css_online(struct cgroup_subsys_state *css)
{
	struct bfqio_cgroup *group = css_to_bfqio(css);

	mutex_lock(&bfqio_mutex);
	group->online = true;
	mutex_unlock(&bfqio_mutex);

	return 0;
}

/* css_offline callback: mark the cgroup as no longer online. */
static void bfqio_css_offline(struct cgroup_subsys_state *css)
{
	struct bfqio_cgroup *group = css_to_bfqio(css);

	mutex_lock(&bfqio_mutex);
	group->online = false;
	mutex_unlock(&bfqio_mutex);
}

/* The bfqio cgroup subsystem: callbacks and control files. */
struct cgroup_subsys bfqio_cgrp_subsys = {
	/* css life cycle */
	.css_alloc = bfqio_create,
	.css_online = bfqio_css_online,
	.css_offline = bfqio_css_offline,
	.css_free = bfqio_destroy,

	/* task migration */
	.can_attach = bfqio_can_attach,
	.attach = bfqio_attach,

	/* control files */
	.legacy_cftypes = bfqio_files,
};
#else
/*
 * Variant without cgroup support: attach @entity to the scheduler data
 * of @bfqg and turn its pending (new_*) parameters into the effective
 * ones.  The entity's parent is not touched.
 */
static inline void bfq_init_entity(struct bfq_entity *entity,
				   struct bfq_group *bfqg)
{
	entity->sched_data = &bfqg->sched_data;

	entity->ioprio_class = entity->new_ioprio_class;
	entity->ioprio = entity->new_ioprio;
	entity->orig_weight = entity->new_weight;
	entity->weight = entity->new_weight;
}

static inline struct bfq_group *
bfq_bic_update_cgroup(struct bfq_io_cq *bic)
{
	struct bfq_data *bfqd = bic_to_bfqd(bic);
	return bfqd->root_group;
}

/* Variant without cgroup support: there is no other group to move to. */
static inline void bfq_bfqq_move(struct bfq_data *bfqd,
				 struct bfq_queue *bfqq,
				 struct bfq_entity *entity,
				 struct bfq_group *bfqg)
{
	/* Intentionally empty. */
}

/* Variant without cgroup support: only the root group has to be handled. */
static void bfq_end_wr_async(struct bfq_data *bfqd)
{
	struct bfq_group *root = bfqd->root_group;

	bfq_end_wr_async_queues(bfqd, root);
}

/*
 * Variant without cgroup support: just release the async queues of the
 * root group.
 */
static inline void bfq_disconnect_groups(struct bfq_data *bfqd)
{
	struct bfq_group *root = bfqd->root_group;

	bfq_put_async_queues(bfqd, root);
}

/* Variant without cgroup support: free the root group of @bfqd. */
static inline void bfq_free_root_group(struct bfq_data *bfqd)
{
	struct bfq_group *root = bfqd->root_group;

	kfree(root);
}

/*
 * Variant without cgroup support: allocate, zeroed and on NUMA node
 * @node, the root group of @bfqd and initialize its service trees.
 * Returns the new group, or NULL if the allocation fails.
 */
static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node)
{
	struct bfq_group *root;
	int cls;

	root = kmalloc_node(sizeof(*root), GFP_KERNEL | __GFP_ZERO, node);
	if (!root)
		return NULL;

	for (cls = 0; cls < BFQ_IOPRIO_CLASSES; cls++)
		root->sched_data.service_tree[cls] = BFQ_SERVICE_TREE_INIT;

	return root;
}
#endif